mindspore-2.7.0rc1-cp310-cp310-win_amd64.whl → mindspore-2.7.1-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mindspore might be problematic.

Files changed (370)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +5 -2
  3. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +2 -2
  7. mindspore/_extends/builtin_operations.py +3 -3
  8. mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
  9. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  10. mindspore/_extends/parse/__init__.py +3 -3
  11. mindspore/_extends/parse/compile_config.py +24 -1
  12. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +6 -3
  13. mindspore/_extends/parse/parser.py +28 -22
  14. mindspore/_extends/parse/resources.py +1 -1
  15. mindspore/_extends/parse/standard_method.py +23 -2
  16. mindspore/_extends/parse/trope.py +2 -1
  17. mindspore/_extends/pijit/pijit_func_white_list.py +9 -27
  18. mindspore/amp.py +0 -18
  19. mindspore/avcodec-59.dll +0 -0
  20. mindspore/avdevice-59.dll +0 -0
  21. mindspore/avfilter-8.dll +0 -0
  22. mindspore/avformat-59.dll +0 -0
  23. mindspore/avutil-57.dll +0 -0
  24. mindspore/boost/base.py +29 -2
  25. mindspore/common/__init__.py +18 -12
  26. mindspore/common/_decorator.py +3 -2
  27. mindspore/common/_grad_function.py +3 -1
  28. mindspore/common/_tensor_cpp_method.py +1 -1
  29. mindspore/common/_tensor_docs.py +371 -96
  30. mindspore/common/_utils.py +7 -43
  31. mindspore/common/api.py +434 -135
  32. mindspore/common/dtype.py +98 -57
  33. mindspore/common/dump.py +7 -108
  34. mindspore/common/dynamic_shape/__init__.py +0 -0
  35. mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +15 -23
  36. mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
  37. mindspore/common/file_system.py +59 -9
  38. mindspore/common/hook_handle.py +82 -3
  39. mindspore/common/jit_config.py +5 -1
  40. mindspore/common/jit_trace.py +27 -12
  41. mindspore/common/lazy_inline.py +5 -3
  42. mindspore/common/np_dtype.py +3 -3
  43. mindspore/common/parameter.py +17 -127
  44. mindspore/common/recompute.py +4 -13
  45. mindspore/common/tensor.py +50 -217
  46. mindspore/communication/_comm_helper.py +11 -1
  47. mindspore/communication/comm_func.py +138 -4
  48. mindspore/communication/management.py +85 -1
  49. mindspore/config/op_info.config +0 -15
  50. mindspore/context.py +20 -106
  51. mindspore/dataset/__init__.py +1 -1
  52. mindspore/dataset/audio/transforms.py +1 -1
  53. mindspore/dataset/core/config.py +35 -1
  54. mindspore/dataset/engine/datasets.py +338 -319
  55. mindspore/dataset/engine/datasets_user_defined.py +38 -22
  56. mindspore/dataset/engine/datasets_vision.py +1 -1
  57. mindspore/dataset/engine/validators.py +1 -15
  58. mindspore/dataset/transforms/c_transforms.py +2 -2
  59. mindspore/dataset/transforms/transforms.py +3 -3
  60. mindspore/dataset/vision/__init__.py +1 -1
  61. mindspore/dataset/vision/py_transforms.py +8 -8
  62. mindspore/dataset/vision/transforms.py +17 -5
  63. mindspore/dataset/vision/utils.py +632 -21
  64. mindspore/device_context/ascend/op_tuning.py +35 -1
  65. mindspore/dnnl.dll +0 -0
  66. mindspore/{profiler/common/validator → graph}/__init__.py +9 -1
  67. mindspore/graph/custom_pass.py +55 -0
  68. mindspore/include/api/cell.h +28 -4
  69. mindspore/include/api/cfg.h +24 -7
  70. mindspore/include/api/context.h +1 -0
  71. mindspore/include/api/delegate.h +0 -2
  72. mindspore/include/api/dual_abi_helper.h +100 -19
  73. mindspore/include/api/graph.h +14 -1
  74. mindspore/include/api/kernel.h +16 -3
  75. mindspore/include/api/kernel_api.h +9 -1
  76. mindspore/include/api/metrics/accuracy.h +9 -0
  77. mindspore/include/api/model.h +5 -1
  78. mindspore/include/api/model_group.h +4 -0
  79. mindspore/include/api/model_parallel_runner.h +2 -0
  80. mindspore/include/api/status.h +48 -10
  81. mindspore/include/api/types.h +6 -1
  82. mindspore/include/dataset/constants.h +9 -0
  83. mindspore/include/dataset/execute.h +2 -2
  84. mindspore/jpeg62.dll +0 -0
  85. mindspore/mindrecord/__init__.py +3 -3
  86. mindspore/mindrecord/common/exceptions.py +1 -0
  87. mindspore/mindrecord/config.py +1 -1
  88. mindspore/{parallel/mpi → mindrecord/core}/__init__.py +4 -1
  89. mindspore/mindrecord/{shardheader.py → core/shardheader.py} +2 -1
  90. mindspore/mindrecord/{shardindexgenerator.py → core/shardindexgenerator.py} +1 -1
  91. mindspore/mindrecord/{shardreader.py → core/shardreader.py} +2 -1
  92. mindspore/mindrecord/{shardsegment.py → core/shardsegment.py} +2 -2
  93. mindspore/mindrecord/{shardutils.py → core/shardutils.py} +1 -1
  94. mindspore/mindrecord/{shardwriter.py → core/shardwriter.py} +1 -1
  95. mindspore/mindrecord/filereader.py +4 -4
  96. mindspore/mindrecord/filewriter.py +5 -5
  97. mindspore/mindrecord/mindpage.py +2 -2
  98. mindspore/mindrecord/tools/cifar10.py +4 -3
  99. mindspore/mindrecord/tools/cifar100.py +1 -1
  100. mindspore/mindrecord/tools/cifar100_to_mr.py +1 -1
  101. mindspore/mindrecord/tools/cifar10_to_mr.py +6 -6
  102. mindspore/mindrecord/tools/csv_to_mr.py +1 -1
  103. mindspore/mindrecord/tools/imagenet_to_mr.py +1 -1
  104. mindspore/mindrecord/tools/mnist_to_mr.py +1 -1
  105. mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -1
  106. mindspore/mindspore_backend_common.dll +0 -0
  107. mindspore/mindspore_backend_manager.dll +0 -0
  108. mindspore/mindspore_cluster.dll +0 -0
  109. mindspore/mindspore_common.dll +0 -0
  110. mindspore/mindspore_core.dll +0 -0
  111. mindspore/mindspore_cpu.dll +0 -0
  112. mindspore/mindspore_dump.dll +0 -0
  113. mindspore/mindspore_frontend.dll +0 -0
  114. mindspore/mindspore_glog.dll +0 -0
  115. mindspore/mindspore_hardware_abstract.dll +0 -0
  116. mindspore/mindspore_memory_pool.dll +0 -0
  117. mindspore/mindspore_ms_backend.dll +0 -0
  118. mindspore/mindspore_ops.dll +0 -0
  119. mindspore/{mindspore_ops_host.dll → mindspore_ops_cpu.dll} +0 -0
  120. mindspore/mindspore_profiler.dll +0 -0
  121. mindspore/mindspore_pyboost.dll +0 -0
  122. mindspore/mindspore_pynative.dll +0 -0
  123. mindspore/mindspore_runtime_pipeline.dll +0 -0
  124. mindspore/mindspore_runtime_utils.dll +0 -0
  125. mindspore/mindspore_tools.dll +0 -0
  126. mindspore/mint/__init__.py +15 -10
  127. mindspore/mint/distributed/__init__.py +4 -0
  128. mindspore/mint/distributed/distributed.py +392 -69
  129. mindspore/mint/nn/__init__.py +2 -16
  130. mindspore/mint/nn/functional.py +4 -110
  131. mindspore/mint/nn/layer/__init__.py +0 -2
  132. mindspore/mint/nn/layer/_functions.py +1 -2
  133. mindspore/mint/nn/layer/activation.py +0 -6
  134. mindspore/mint/nn/layer/basic.py +0 -47
  135. mindspore/mint/nn/layer/conv.py +10 -10
  136. mindspore/mint/nn/layer/normalization.py +11 -16
  137. mindspore/mint/nn/layer/pooling.py +0 -4
  138. mindspore/nn/__init__.py +1 -3
  139. mindspore/nn/cell.py +231 -239
  140. mindspore/nn/layer/activation.py +4 -2
  141. mindspore/nn/layer/basic.py +56 -14
  142. mindspore/nn/layer/container.py +16 -0
  143. mindspore/nn/layer/embedding.py +4 -169
  144. mindspore/nn/layer/image.py +1 -1
  145. mindspore/nn/layer/normalization.py +2 -1
  146. mindspore/nn/layer/thor_layer.py +4 -85
  147. mindspore/nn/optim/ada_grad.py +0 -1
  148. mindspore/nn/optim/adafactor.py +0 -1
  149. mindspore/nn/optim/adam.py +32 -127
  150. mindspore/nn/optim/adamax.py +0 -1
  151. mindspore/nn/optim/asgd.py +0 -1
  152. mindspore/nn/optim/ftrl.py +8 -102
  153. mindspore/nn/optim/lamb.py +1 -4
  154. mindspore/nn/optim/lars.py +0 -3
  155. mindspore/nn/optim/lazyadam.py +25 -218
  156. mindspore/nn/optim/momentum.py +5 -43
  157. mindspore/nn/optim/optimizer.py +6 -55
  158. mindspore/nn/optim/proximal_ada_grad.py +0 -1
  159. mindspore/nn/optim/rmsprop.py +0 -1
  160. mindspore/nn/optim/rprop.py +0 -1
  161. mindspore/nn/optim/sgd.py +0 -1
  162. mindspore/nn/optim/tft_wrapper.py +2 -4
  163. mindspore/nn/optim/thor.py +0 -2
  164. mindspore/nn/probability/bijector/bijector.py +7 -8
  165. mindspore/nn/probability/bijector/gumbel_cdf.py +2 -2
  166. mindspore/nn/probability/bijector/power_transform.py +20 -21
  167. mindspore/nn/probability/bijector/scalar_affine.py +5 -5
  168. mindspore/nn/probability/bijector/softplus.py +13 -14
  169. mindspore/nn/probability/distribution/_utils/utils.py +2 -2
  170. mindspore/nn/wrap/cell_wrapper.py +39 -5
  171. mindspore/nn/wrap/grad_reducer.py +4 -89
  172. mindspore/numpy/array_creations.py +4 -4
  173. mindspore/numpy/fft.py +9 -9
  174. mindspore/numpy/utils_const.py +1 -1
  175. mindspore/{nn/reinforcement → onnx}/__init__.py +5 -8
  176. mindspore/onnx/onnx_export.py +137 -0
  177. mindspore/opencv_core4110.dll +0 -0
  178. mindspore/opencv_imgcodecs4110.dll +0 -0
  179. mindspore/{opencv_imgproc452.dll → opencv_imgproc4110.dll} +0 -0
  180. mindspore/ops/__init__.py +2 -0
  181. mindspore/ops/_grad_experimental/grad_comm_ops.py +38 -2
  182. mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
  183. mindspore/ops/_op_impl/aicpu/__init__.py +0 -10
  184. mindspore/ops/_op_impl/cpu/__init__.py +1 -5
  185. mindspore/ops/_op_impl/cpu/{buffer_append.py → joinedstr_op.py} +8 -8
  186. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +28 -24
  187. mindspore/ops/auto_generate/gen_extend_func.py +6 -11
  188. mindspore/ops/auto_generate/gen_ops_def.py +385 -154
  189. mindspore/ops/auto_generate/gen_ops_prim.py +5676 -5167
  190. mindspore/ops/communication.py +97 -0
  191. mindspore/ops/composite/__init__.py +5 -2
  192. mindspore/ops/composite/base.py +16 -2
  193. mindspore/ops/composite/multitype_ops/__init__.py +3 -1
  194. mindspore/ops/composite/multitype_ops/_compile_utils.py +150 -8
  195. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
  196. mindspore/ops/composite/multitype_ops/add_impl.py +7 -0
  197. mindspore/ops/composite/multitype_ops/mod_impl.py +27 -0
  198. mindspore/ops/function/__init__.py +2 -0
  199. mindspore/ops/function/array_func.py +24 -18
  200. mindspore/ops/function/comm_func.py +3883 -0
  201. mindspore/ops/function/debug_func.py +7 -6
  202. mindspore/ops/function/grad/grad_func.py +4 -12
  203. mindspore/ops/function/math_func.py +89 -86
  204. mindspore/ops/function/nn_func.py +92 -313
  205. mindspore/ops/function/random_func.py +9 -18
  206. mindspore/ops/functional.py +4 -1
  207. mindspore/ops/functional_overload.py +377 -30
  208. mindspore/ops/operations/__init__.py +2 -5
  209. mindspore/ops/operations/_custom_ops_utils.py +7 -9
  210. mindspore/ops/operations/_inner_ops.py +12 -50
  211. mindspore/ops/operations/_rl_inner_ops.py +0 -933
  212. mindspore/ops/operations/array_ops.py +5 -50
  213. mindspore/ops/operations/comm_ops.py +95 -17
  214. mindspore/ops/operations/custom_ops.py +237 -22
  215. mindspore/ops/operations/debug_ops.py +33 -35
  216. mindspore/ops/operations/manually_defined/ops_def.py +39 -318
  217. mindspore/ops/operations/math_ops.py +5 -5
  218. mindspore/ops/operations/nn_ops.py +3 -3
  219. mindspore/ops/operations/sparse_ops.py +0 -83
  220. mindspore/ops/primitive.py +4 -27
  221. mindspore/ops/tensor_method.py +88 -10
  222. mindspore/ops_generate/aclnn/aclnn_kernel_register_auto_cc_generator.py +5 -5
  223. mindspore/ops_generate/aclnn/gen_aclnn_implement.py +8 -8
  224. mindspore/ops_generate/api/functions_cc_generator.py +53 -4
  225. mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +25 -11
  226. mindspore/ops_generate/common/gen_constants.py +11 -10
  227. mindspore/ops_generate/common/op_proto.py +18 -1
  228. mindspore/ops_generate/common/template.py +102 -245
  229. mindspore/ops_generate/common/template_utils.py +212 -0
  230. mindspore/ops_generate/gen_custom_ops.py +69 -0
  231. mindspore/ops_generate/op_def/ops_def_cc_generator.py +78 -7
  232. mindspore/ops_generate/op_def_py/base_op_prim_py_generator.py +360 -0
  233. mindspore/ops_generate/op_def_py/custom_op_prim_py_generator.py +140 -0
  234. mindspore/ops_generate/op_def_py/op_def_py_generator.py +54 -7
  235. mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -312
  236. mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +74 -17
  237. mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +22 -5
  238. mindspore/ops_generate/pyboost/gen_pyboost_func.py +0 -16
  239. mindspore/ops_generate/pyboost/op_template_parser.py +3 -2
  240. mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +21 -5
  241. mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +2 -2
  242. mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +30 -10
  243. mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +10 -3
  244. mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +1 -1
  245. mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +19 -9
  246. mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +71 -28
  247. mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +10 -9
  248. mindspore/ops_generate/pyboost/pyboost_utils.py +27 -16
  249. mindspore/ops_generate/resources/yaml_loader.py +13 -0
  250. mindspore/ops_generate/tensor_py_cc_generator.py +2 -2
  251. mindspore/parallel/_auto_parallel_context.py +5 -15
  252. mindspore/parallel/_cell_wrapper.py +1 -1
  253. mindspore/parallel/_parallel_serialization.py +4 -6
  254. mindspore/parallel/_ps_context.py +2 -2
  255. mindspore/parallel/_utils.py +34 -17
  256. mindspore/parallel/auto_parallel.py +23 -9
  257. mindspore/parallel/checkpoint_transform.py +20 -2
  258. mindspore/parallel/cluster/process_entity/_api.py +28 -33
  259. mindspore/parallel/cluster/process_entity/_utils.py +9 -5
  260. mindspore/parallel/cluster/run.py +5 -3
  261. mindspore/{experimental/llm_boost/ascend_native → parallel/distributed}/__init__.py +21 -22
  262. mindspore/parallel/distributed/distributed_data_parallel.py +393 -0
  263. mindspore/parallel/distributed/flatten_grad_buffer.py +295 -0
  264. mindspore/parallel/function/reshard_func.py +6 -5
  265. mindspore/parallel/nn/parallel_cell_wrapper.py +40 -3
  266. mindspore/parallel/nn/parallel_grad_reducer.py +0 -8
  267. mindspore/parallel/shard.py +7 -21
  268. mindspore/parallel/strategy.py +336 -0
  269. mindspore/parallel/transform_safetensors.py +127 -20
  270. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +13 -9
  271. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
  272. mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +1 -1
  273. mindspore/profiler/common/constant.py +5 -0
  274. mindspore/profiler/common/file_manager.py +9 -0
  275. mindspore/profiler/common/msprof_cmd_tool.py +40 -4
  276. mindspore/profiler/common/path_manager.py +65 -24
  277. mindspore/profiler/common/profiler_context.py +27 -14
  278. mindspore/profiler/common/profiler_info.py +3 -3
  279. mindspore/profiler/common/profiler_meta_data.py +1 -0
  280. mindspore/profiler/common/profiler_op_analyse.py +10 -6
  281. mindspore/profiler/common/profiler_path_manager.py +13 -0
  282. mindspore/profiler/common/util.py +30 -3
  283. mindspore/profiler/dynamic_profiler.py +91 -46
  284. mindspore/profiler/envprofiler.py +30 -5
  285. mindspore/profiler/experimental_config.py +18 -2
  286. mindspore/profiler/platform/cpu_profiler.py +10 -4
  287. mindspore/profiler/platform/npu_profiler.py +34 -7
  288. mindspore/profiler/profiler.py +193 -145
  289. mindspore/profiler/profiler_action_controller.py +1 -1
  290. mindspore/profiler/profiler_interface.py +2 -2
  291. mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
  292. mindspore/run_check/_check_version.py +108 -24
  293. mindspore/runtime/__init__.py +9 -6
  294. mindspore/runtime/executor.py +35 -0
  295. mindspore/runtime/memory.py +113 -0
  296. mindspore/runtime/thread_bind_core.py +1 -1
  297. mindspore/swresample-4.dll +0 -0
  298. mindspore/swscale-6.dll +0 -0
  299. mindspore/tinyxml2.dll +0 -0
  300. mindspore/{experimental/llm_boost → tools}/__init__.py +5 -5
  301. mindspore/tools/data_dump.py +130 -0
  302. mindspore/tools/sdc_detect.py +91 -0
  303. mindspore/tools/stress_detect.py +63 -0
  304. mindspore/train/__init__.py +6 -6
  305. mindspore/train/_utils.py +8 -21
  306. mindspore/train/amp.py +6 -7
  307. mindspore/train/callback/_callback.py +2 -1
  308. mindspore/train/callback/_checkpoint.py +1 -17
  309. mindspore/train/callback/_flops_collector.py +10 -6
  310. mindspore/train/callback/_train_fault_tolerance.py +72 -25
  311. mindspore/train/data_sink.py +5 -9
  312. mindspore/train/dataset_helper.py +5 -5
  313. mindspore/train/model.py +41 -230
  314. mindspore/train/serialization.py +160 -401
  315. mindspore/train/train_thor/model_thor.py +2 -2
  316. mindspore/turbojpeg.dll +0 -0
  317. mindspore/utils/__init__.py +6 -3
  318. mindspore/utils/dlpack.py +92 -0
  319. mindspore/utils/dryrun.py +1 -1
  320. mindspore/utils/runtime_execution_order_check.py +10 -0
  321. mindspore/utils/sdc_detect.py +14 -12
  322. mindspore/utils/stress_detect.py +43 -0
  323. mindspore/utils/utils.py +152 -16
  324. mindspore/version.py +1 -1
  325. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/METADATA +3 -2
  326. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/RECORD +330 -344
  327. mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
  328. mindspore/communication/_hccl_management.py +0 -297
  329. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -207
  330. mindspore/experimental/llm_boost/ascend_native/llm_boost.py +0 -52
  331. mindspore/experimental/llm_boost/atb/__init__.py +0 -23
  332. mindspore/experimental/llm_boost/atb/boost_base.py +0 -385
  333. mindspore/experimental/llm_boost/atb/llama_boost.py +0 -137
  334. mindspore/experimental/llm_boost/atb/qwen_boost.py +0 -124
  335. mindspore/experimental/llm_boost/register.py +0 -130
  336. mindspore/experimental/llm_boost/utils.py +0 -31
  337. mindspore/include/OWNERS +0 -7
  338. mindspore/mindspore_cpu_res_manager.dll +0 -0
  339. mindspore/mindspore_ops_kernel_common.dll +0 -0
  340. mindspore/mindspore_res_manager.dll +0 -0
  341. mindspore/nn/optim/_dist_optimizer_registry.py +0 -111
  342. mindspore/nn/reinforcement/_batch_read_write.py +0 -142
  343. mindspore/nn/reinforcement/_tensors_queue.py +0 -152
  344. mindspore/nn/reinforcement/tensor_array.py +0 -145
  345. mindspore/opencv_core452.dll +0 -0
  346. mindspore/opencv_imgcodecs452.dll +0 -0
  347. mindspore/ops/_op_impl/aicpu/priority_replay_buffer.py +0 -113
  348. mindspore/ops/_op_impl/aicpu/reservoir_replay_buffer.py +0 -96
  349. mindspore/ops/_op_impl/aicpu/sparse_cross.py +0 -42
  350. mindspore/ops/_op_impl/cpu/buffer_get.py +0 -28
  351. mindspore/ops/_op_impl/cpu/buffer_sample.py +0 -28
  352. mindspore/ops/_op_impl/cpu/priority_replay_buffer.py +0 -42
  353. mindspore/ops/operations/_tensor_array.py +0 -359
  354. mindspore/ops/operations/rl_ops.py +0 -288
  355. mindspore/parallel/_offload_context.py +0 -275
  356. mindspore/parallel/_recovery_context.py +0 -115
  357. mindspore/parallel/_transformer/__init__.py +0 -35
  358. mindspore/parallel/_transformer/layers.py +0 -765
  359. mindspore/parallel/_transformer/loss.py +0 -251
  360. mindspore/parallel/_transformer/moe.py +0 -693
  361. mindspore/parallel/_transformer/op_parallel_config.py +0 -222
  362. mindspore/parallel/_transformer/transformer.py +0 -3124
  363. mindspore/parallel/mpi/_mpi_config.py +0 -116
  364. mindspore/profiler/common/validator/validate_path.py +0 -84
  365. mindspore/train/memory_profiling_pb2.py +0 -298
  366. mindspore/utils/hooks.py +0 -81
  367. /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
  368. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/WHEEL +0 -0
  369. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/entry_points.txt +0 -0
  370. {mindspore-2.7.0rc1.dist-info → mindspore-2.7.1.dist-info}/top_level.txt +0 -0
@@ -29,10 +29,8 @@ import atexit
  import glob
  import json
  import os
- import queue
  import signal
  import stat
- import subprocess
  import warnings

  import time
@@ -41,6 +39,7 @@ import multiprocessing
  from importlib import import_module
  import sys
  import threading
+ from types import GeneratorType

  import copy
  import weakref
@@ -65,7 +64,6 @@ from mindspore.dataset.engine import samplers
  from mindspore.dataset.engine.samplers import Shuffle
  from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterator_cleanup, _set_iterator_cleanup, \
      ITERATORS_LIST, _unset_iterator_cleanup, _cleanup_the_iterators_if_created
- from .queue import _SharedQueue, _Queue
  from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
      check_rename, check_device_send, check_take, check_output_shape, check_project, \
      check_sync_wait, check_zip_dataset, check_add_column, check_concat, check_split, check_bucket_batch_by_length, \
@@ -73,7 +71,8 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
      check_total_batch, check_sync_update
  from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
      get_enable_watchdog, get_seed, set_seed, get_debug_mode, get_multiprocessing_timeout_interval, \
-     _get_debug_hook_list, get_multiprocessing_start_method
+     _get_debug_hook_list, get_multiprocessing_start_method, get_video_backend, set_video_backend, \
+     get_error_samples_mode, ErrorSamplesMode
  from ..core.datatypes import mstype_to_detype
  from ..core.validator_helpers import replace_none
  from ..core.py_util_helpers import ExceptionHandler
@@ -458,8 +457,10 @@ class Dataset:
              each bucket. Must contain len(bucket_boundaries)+1 elements.
          element_length_function (Callable, optional): A function that takes in
              M arguments where M = len(column_names) and returns an integer. If no value
-             provided, parameter M the len(column_names) must be 1, and the size of the first
-             dimension of that column will be taken as the length. Default: ``None``.
+             provided, parameter M the len(column_names) must be 1. At this time, the length of the data in this
+             column is determined based on its ndim. If ndim=0, the data length is 0, indicating a str, bool, int,
+             or float scalar; if it is an array with ndim > 0, the length of the data is array.shape[0].
+             Default: ``None`` , indicating this parameter is not specified.
          pad_info (dict, optional): The information about how to batch each column. The key
              corresponds to the column name, and the value must be a tuple of 2 elements.
              The first element corresponds to the shape to pad to, and the second
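
Note: the reworked docstring above spells out the default length rule used when element_length_function is None. A minimal standalone sketch of that rule; the helper name default_element_length is ours, not MindSpore API:

    import numpy as np

    def default_element_length(column):
        # Documented default: a scalar (ndim == 0, e.g. str/bool/int/float)
        # has length 0; an array uses the size of its first dimension.
        arr = np.asarray(column)
        return 0 if arr.ndim == 0 else arr.shape[0]

    assert default_element_length("a scalar string") == 0
    assert default_element_length(np.zeros((5, 3))) == 5
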
@@ -820,8 +821,7 @@ class Dataset:
          return dataset

      @check_map
-     def map(self, operations, input_columns=None, output_columns=None, column_order=None,
-             num_parallel_workers=None, **kwargs):
+     def map(self, operations, input_columns=None, output_columns=None, num_parallel_workers=None, **kwargs):
          """
          Apply each operation in operations to this dataset.

@@ -2752,8 +2752,6 @@ class BatchDataset(UnionBaseDataset):

              self.process_pool = _PythonMultiprocessing(get_multiprocessing_start_method(), self.num_parallel_workers,
                                                         str(self), [self.per_batch_map], self.max_rowsize)
-             # Wrap per_batch_map into _PythonCallable
-             self.per_batch_map = _PythonCallable(self.per_batch_map, 0, self.process_pool)
          else:
              if self.per_batch_map is not None:
                  self.per_batch_map = FuncWrapper(self.per_batch_map)
@@ -3057,95 +3055,6 @@ _OP_NAME = dict()
  _OP_PROCESS = dict()


- # PythonCallable wrapper for multiprocess pyfunc
- class _PythonCallable:
-     """
-     Internal Python function wrapper for multiprocessing pyfunc.
-     """
-
-     def __init__(self, py_callable, idx, pool=None):
-         # Original Python callable from user.
-         self.py_callable = py_callable
-         # Process pool created for current iterator.
-         self.pool = pool
-         # Python callable index
-         self.idx = idx
-
-     def __call__(self, *args):
-         result = None
-         get_data_from_worker_process = False
-         while get_data_from_worker_process is False:
-             if self.pool.is_running() and check_iterator_cleanup() is False:
-                 try:
-                     result = self.pool.execute(self.idx, *args)
-                 except multiprocessing.TimeoutError:
-                     continue
-                 get_data_from_worker_process = True
-             else:
-                 # worker process is stopped
-                 logger.info("The worker process of map operation is stopped. "
-                             "So return None to main thread and break the main thread.")
-                 return None
-         # got value from worker process
-         if not isinstance(result, tuple) and get_data_from_worker_process is True:
-             result = (result,)
-         return result
-
-     def to_json(self):
-         return self.py_callable.to_json()
-
-
- # used when python_multiprocessing=True in map
- class Pipe:
-     """
-     Class to handle communication between the master process and the worker processes.
-     """
-
-     def __init__(self, warning_ctl, shared_memory=False, max_rowsize=(-1, -1)):
-         self.shared_memory = shared_memory
-         self.eof = multiprocessing.Event()
-         if self.shared_memory:
-             self.in_queue = _SharedQueue(1, warning_ctl, max_rowsize=max_rowsize[0])
-             self.res_queue = _SharedQueue(1, warning_ctl, max_rowsize=max_rowsize[1])
-         else:
-             self.in_queue = _Queue(1)
-             self.res_queue = _Queue(1)
-         self.in_queue.cancel_join_thread()  # Ensure that the process does not hung when exiting
-
-     def master_send(self, func_index, data):
-         self.in_queue.put_nowait((func_index, *data))
-
-     def master_receive(self):
-         if self.eof is None:
-             raise RuntimeError("EOF is none when get data from worker.")
-         if self.eof.is_set():
-             return None
-         return self.res_queue.get(timeout=1)
-
-     def master_close(self):
-         self.eof.set()
-         self.send_finish_signal_to_worker()
-         self.send_finish_signal()
-
-     def send_finish_signal(self):
-         self.worker_send(None)
-
-     def send_finish_signal_to_worker(self):
-         self.master_send(0, "QUIT")
-
-     def worker_send(self, data):
-         self.res_queue.put_until(data, timeout=1, exit_signal=self.eof)
-
-     def worker_receive(self):
-         result = self.in_queue.get_until(timeout=1, exit_signal=self.eof)
-         if result is None:
-             return result
-         if len(result) == 1:
-             raise RuntimeError(f"Corrupted data. Worker received {len(result)} elements, it should be more than 1.")
-         func_index, *data = result
-         return func_index, tuple(data)
-
-
  def _main_process_already_exit():
      """
      Judge whether main process already exit.
@@ -3158,15 +3067,21 @@ def _main_process_already_exit():
      return False


- def _worker_loop(operations, pipe, worker_id):
+ def _worker_loop(quit_signal, operations, worker_id, op_type, key, video_backend=None):
      """
      Multiprocess worker process loop.
+     The worker process (Python layer) gets data from / sends data to the map / batch thread (C++ layer) through a
+     message queue and shared memory. This logic no longer uses the Python multiprocessing pool, in_queue and
+     out_queue to transfer data.
      """
+     # Release the lock which had been held in map_op.cc::Launch() / batch_op.cc::Launch()
+     cde.unlock_shm_id_and_msg_id_mutex()
+
      # Initialize C++ side signal handlers
      cde.register_worker_handlers()

-     # Ensure that the process does not hang when exiting
-     pipe.res_queue.cancel_join_thread()
+     if video_backend is not None:
+         set_video_backend(video_backend)

      def _ignore_sigint():
          """
@@ -3180,121 +3095,197 @@ def _worker_loop(operations, pipe, worker_id):
      if get_seed() != 5489:
          set_seed(get_seed() + worker_id)

+     msg_queue = cde.MessageQueue(key)
+     msg_queue.set_release_flag(False)
+     shm_queue = cde.SharedMemoryQueue(key)
+     shm_queue.set_release_flag(False)
+
+     pid = str(os.getpid())
+     ppid = str(os.getppid())
+
+     # Scenario: when the main process is killed, the worker process needs to release shm & msg.
+     # The shm id and msg id should be released by SIGTERM in the worker handler
+     cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
+                                    msg_queue.msg_queue_id)
+
+     num_receive = 0
+     num_send = 0
      while not _main_process_already_exit():
          _ignore_sigint()

-         result = pipe.worker_receive()
-         if result is None:
+         # quit by close_worker
+         if quit_signal.is_set():
              return
-         (idx, input_tensors) = result
-         if input_tensors == "QUIT":
-             break
+
+         # >> receive procedure >>
+         ## 1. get message queue which contains shared memory info from map C++ thread in main process
          try:
-             output_tensors = operations[idx](*input_tensors)
+             cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
+                                            msg_queue.msg_queue_id)
+             msg_queue.msg_rcv(cde.MASTER_SEND_DATA_MSG)
+             cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
+                                            msg_queue.msg_queue_id)
+         except RuntimeError as err:
+             cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
+                                            msg_queue.msg_queue_id)
+             # the msg_queue had been released by main process, ignore it in worker process
+             if "errno: 2" in str(err):
+                 # Because the worker process does not release msg and shm, continue
+                 continue
+             raise err

-             pipe.worker_send(output_tensors)
-         except Exception:
-             pipe.worker_send(ExceptionHandler(where="in map(or batch) worker and execute Python function"))
-             # Do not return
+         ## when the message queue had been released, break the loop
+         if msg_queue.message_queue_state() == cde.MessageState.RELEASED:
+             logger.info("The message queue had been released, worker loop end.")
+             break

-     # release the queue when stop the worker by master
-     del pipe.in_queue
-     del pipe.res_queue
+         num_receive += 1

+         logger.info("Python process {} worker({}) receives {} samples from map thread.".format(op_type, worker_id,
+                                                                                                num_receive))

- def worker_target(operations, worker_id):
-     logger.info("Multiprocessing start method: {}".format(multiprocessing.get_start_method()))
-     return lambda pipe: _worker_loop(operations, pipe, worker_id)
+         # convert the data from shm to python data
+         if op_type == cde.MAP_OP:
+             ## 2. construct shared memory to TensorRow which contains one / more columns
+             tensor_row = shm_queue.to_tensor_row(msg_queue.shm_id, msg_queue.shm_size)

+             ## 3. convert TensorRow to a Python tuple whose elements are the columns
+             tuple_column = cde.convert_tensor_row_to_py_tuple(tensor_row)

- class WorkerTarget:
-     def __init__(self, operations, pipe, worker_id):
-         self.operations = operations
-         self.pipe = pipe
-         self.worker_id = worker_id
-         logger.info("Multiprocessing start method: {}".format(multiprocessing.get_start_method()))
+             py_func_input = tuple_column
+         elif op_type == cde.BATCH_OP:
+             ## 2. construct shared memory to TensorTable which contains one / more TensorRow & CBatchInfo
+             tensor_table, batch_info, _ = shm_queue.to_tensor_table(msg_queue.shm_id, msg_queue.shm_size)

-     def __call__(self):
-         return _worker_loop(self.operations, self.pipe, self.worker_id)
+             ## 3. convert TensorTable to a Python tuple of lists
+             # The tuple indicates the columns
+             # The list indicates the rows
+             tuple_list_column = cde.convert_tensor_table_to_py_tuple_list(tensor_table)

+             py_func_input = (*tuple_list_column, batch_info)
+         else:
+             raise RuntimeError("The op_type: {} is invalid.".format(op_type))

- class _MPWorker(multiprocessing.Process):
-     """
-     Worker process for multiprocessing.
-     """
+         # execute the pyfunc
+         try:
+             py_func_output = py_func_input

-     def __init__(self, operations, warning_ctl, max_rowsize=(-1, -1), worker_id=0):
-         shared_memory = get_enable_shared_mem()
-         self.pipe = Pipe(warning_ctl, shared_memory=shared_memory, max_rowsize=max_rowsize)
-         self.check_interval = get_multiprocessing_timeout_interval()
-         super().__init__(target=worker_target(operations, worker_id), name="MapWorker" + str(worker_id),
-                          args=(self.pipe,), daemon=True)
-
-     def execute(self, idx, *args):
-         """Acquiring data from a worker in an infinite loop"""
-         self.pipe.master_send(idx, args)
-         time_s = time.time()
-         wait_count = 1
-         while True:
-             cost_time = time.time() - time_s
-             if cost_time / self.check_interval >= wait_count:
-                 wait_count += 1
-                 logger.warning("It has been waiting for " + "%.3f" % cost_time + "s because the sub-process "
-                                "worker of the map operation is hanging. "
-                                "Check whether the user defined data transform is too slow or the "
-                                "output data is too large. You can also set the timeout interval by "
-                                "ds.config.set_multiprocessing_timeout_interval to adjust the output frequency "
-                                "of this log.")
-                 pid = self.pid
-                 logger.warning("Map worker subprocess ID {} is stuck.".format(pid))
-                 install_status, _ = subprocess.getstatusoutput("py-spy --version")
-                 if install_status == 0:
-                     stack = subprocess.getoutput("py-spy dump -p {} -l".format(pid))
-                     logger.warning("Map worker subprocess stack:\n{}".format(stack))
+             # execute the remaining operations
+             for idx in range(len(operations)):
+                 if isinstance(py_func_output, tuple):
+                     py_func_output = operations[idx](*py_func_output)
                  else:
-                     logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
+                     py_func_output = operations[idx](py_func_output)
+
+             # << send procedure <<
+             # the result is None
+             if py_func_output is None:
+                 raise RuntimeError("Got None from Python Function which is defined by {}".format(op_type))
+
+             # convert the output to tuple
+             if not isinstance(py_func_output, tuple):
+                 py_func_output = (py_func_output,)
+
+             if op_type == cde.MAP_OP:
+                 # check if the map returns a Generator type
+                 for item in py_func_output:
+                     if isinstance(item, GeneratorType):
+                         raise RuntimeError("Cannot pickle <class 'generator'> object, please verify pyfunc "
+                                            "return with numpy array")
+
+                 ## 1. convert Python tuple to TensorRow
+                 output_tensor_row = cde.convert_py_tuple_to_tensor_row(py_func_output)
+
+                 ## 2. convert TensorRow to shared memory
+                 shm_queue.from_tensor_row(output_tensor_row)
+             elif op_type == cde.BATCH_OP:
+                 ## 1. convert Python tuple of lists to TensorTable
+                 output_tensor_table, concat_batch = cde.convert_py_tuple_list_to_tensor_table(py_func_output)
+
+                 ## 2. convert TensorTable to shared memory
+                 shm_queue.from_tensor_table(output_tensor_table, batch_info, concat_batch)
+             else:
+                 raise RuntimeError("The op_type: {} is invalid.".format(op_type))
+
+             ## 3. send message queue which contains shared memory to map C++ thread in main process
+             cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
+                                            msg_queue.msg_queue_id)
+             msg_queue.msg_snd(cde.WORKER_SEND_DATA_MSG, shm_queue.get_shm_id(), shm_queue.get_shm_size())
+             cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
+                                            msg_queue.msg_queue_id)
+
+             num_send += 1
+             logger.info("Python process {} worker({}) sends {} samples to map thread.".format(op_type, worker_id,
+                                                                                               num_send))
+         except Exception:
              try:
-                 res = self.pipe.master_receive()
-             except queue.Empty:
-                 continue
-             if res is None:
-                 # receive finish signal
-                 return None
-             if isinstance(res, ExceptionHandler):
-                 res.reraise()
-             return res
-
-     def close(self):
-         try:
-             if self.is_alive():
-                 # release the eager executor which is used by current process
-                 transforms.transforms.clean_unused_executors()
-
-                 logger.info(f"Closing worker with PID: {self.pid}")
-                 self.pipe.master_close()
-
-                 process_dir = os.path.join('/proc', str(self.pid))
-                 while self.is_alive() and os.path.exists(process_dir):
-                     logger.info("Waiting for worker {} closed ...".format(self.pid))
-                     time.sleep(0.001)
-
-                 # del the handle which hold by master
-                 del self.pipe.in_queue
-                 del self.pipe.res_queue
-                 super().terminate()
-                 super().join()
-                 super().close()
-
-         except ValueError:
-             # Process has been closed already
+                 if op_type == cde.MAP_OP:
+                     pyfunc_err = ExceptionHandler(where="in map worker and execute Python function")
+                 elif op_type == cde.BATCH_OP:
+                     pyfunc_err = ExceptionHandler(where="in batch(per_batch_map) worker and execute Python function")
+                 else:
+                     pyfunc_err = "The op_type: {} is invalid.".format(op_type)
+                 pyfunc_err.reraise()
+             except Exception as err:
+                 _, _, exc_tb = sys.exc_info()
+                 fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
+
+                 if op_type == cde.MAP_OP:
+                     logger.info("Got exception {} from Map Worker({})".format(str(err), worker_id))
+                 elif op_type == cde.BATCH_OP:
+                     logger.info("Got exception {} from Batch Worker({})".format(str(err), worker_id))
+                 else:
+                     logger.info("The op_type: {} is invalid.".format(op_type))
+
+                 # err_code, lineno, filename, err_desc
+                 msg_queue.serialize_status(cde.StatusCode.MD_PY_FUNC_EXCEPTION, exc_tb.tb_lineno, fname, str(err))
+
+                 cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
+                                                msg_queue.msg_queue_id)
+                 msg_queue.msg_snd(cde.WORKER_SEND_DATA_MSG, shm_queue.get_shm_id(), shm_queue.get_shm_size())
+                 cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
+                                                msg_queue.msg_queue_id)
+
+             # worker error
+             if get_error_samples_mode() == ErrorSamplesMode.RETURN:
+                 break
+             else:
+                 # continue the loop, when the get_error_samples_mode() is REPLACE or SKIP
+                 continue
+
+     # release the eager executor which is used by current process
+     transforms.transforms.clean_unused_executors()
+
+     while not _main_process_already_exit():
+         # quit by close_worker
+         if quit_signal.is_set():
              return
-         return

-     def is_alive(self):
-         try:
-             return super().is_alive()
-         except ValueError:
-             return False
+         logger.info("The worker process is waiting for the main process to exit.")
+         time.sleep(0.1)
+
+     # the main process no longer exists; it may have been killed with -9
+     msg_queue.set_release_flag(True)
+     msg_queue.release()
+     shm_queue.set_release_flag(True)
+     shm_queue.release()
+
+
+ class WorkerTarget:
+     """Multiprocess mode for dataset map or batch"""
+     def __init__(self, quit_signal, operations, worker_id, op_type, ftok_key):
+         self.quit_signal = quit_signal
+         self.operations = operations
+         self.worker_id = worker_id
+         self.op_type = op_type
+         self.ftok_key = ftok_key
+         start_method = multiprocessing.get_start_method()
+         logger.info("Multiprocessing start method: {}".format(start_method))
+         self.video_backend = get_video_backend() if start_method == 'spawn' else None
+
+     def __call__(self):
+         return _worker_loop(self.quit_signal, self.operations, self.worker_id, self.op_type, self.ftok_key,
+                             self.video_backend)


  def worker_is_alive(worker):
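
Note: the rewritten _worker_loop above replaces the old pipe-driven pool with a message queue plus shared-memory queue owned by the C++ layer (cde.MessageQueue / cde.SharedMemoryQueue): receive a row, run the pyfuncs, write the result back, and report errors through serialize_status. A rough standalone analogue of that receive → execute → send cycle, using multiprocessing.Queue as a stand-in for the cde primitives (every name below is illustrative, not MindSpore API):

    import multiprocessing
    from queue import Empty

    def double(x):
        return x * 2

    def worker_loop(quit_signal, operations, in_queue, out_queue):
        """Stand-in for _worker_loop; the queues play the role of msg/shm."""
        while not quit_signal.is_set():
            try:
                row = in_queue.get(timeout=1)        # >> receive procedure >>
            except Empty:
                continue
            for op in operations:                    # execute the pyfuncs in order
                row = op(*row) if isinstance(row, tuple) else op(row)
            if not isinstance(row, tuple):
                row = (row,)                         # normalize to a tuple of columns
            out_queue.put(row)                       # << send procedure <<

    if __name__ == "__main__":
        quit_signal = multiprocessing.Event()
        in_q, out_q = multiprocessing.Queue(), multiprocessing.Queue()
        worker = multiprocessing.Process(target=worker_loop,
                                         args=(quit_signal, [double], in_q, out_q),
                                         daemon=True)
        worker.start()
        in_q.put(3)
        print(out_q.get())                           # (6,)
        quit_signal.set()
        worker.join(timeout=2)
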
@@ -3305,24 +3296,31 @@ def worker_is_alive(worker):
      return False


- def close_worker(worker, pipe):
+ def close_worker(worker, eof):
      """Close the subprocess worker in spawn mode"""
      try:
          if worker_is_alive(worker):
              # release the eager executor which is used by current process
              transforms.transforms.clean_unused_executors()

-             logger.info(f"Closing worker with PID: {worker.pid}")
-             pipe.master_close()
+             # let the worker exit
+             logger.info("Set eof flag for worker with PID: {}.".format(worker.pid))
+             eof.set()
+
+             # wait timeout
+             wait_timeout = 2
+             start_time = time.time()

              process_dir = os.path.join('/proc', str(worker.pid))
              while worker_is_alive(worker) and os.path.exists(process_dir):
                  logger.info("Waiting for worker {} closed ...".format(worker.pid))
                  time.sleep(0.5)

+                 # maybe the worker is hung by msg_queue.MsgRcv, so break the loop and terminate it in the next step
+                 if time.time() - start_time > wait_timeout:
+                     break
+
              # del the handle which hold by master
-             del pipe.in_queue
-             del pipe.res_queue
              worker.terminate()
              worker.join()
              worker.close()
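
Note: close_worker now signals the worker through a per-worker Event and caps the wait at 2 seconds before falling through to terminate(), since a worker blocked in a message receive may never observe the flag. The signal / bounded-wait / terminate pattern in isolation (the function name is ours):

    import time

    def stop_worker(worker, eof, wait_timeout=2.0):
        """Ask the worker to exit, wait briefly, then force-terminate."""
        eof.set()                                    # let the worker exit on its own
        deadline = time.time() + wait_timeout
        while worker.is_alive() and time.time() < deadline:
            time.sleep(0.1)                          # bounded wait, like the 2 s cap above
        if worker.is_alive():
            worker.terminate()                       # it may be hung in a blocking receive
        worker.join()
        worker.close()
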
@@ -3379,7 +3377,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
          self.warning_ctl = None
          # cache thread (get_ident()) to worker_id mapping in Python layer
          self.python_threads_to_workers = {}
-         self.eof = None
+         self.eof_workers = []
+         self.eof_clean_process = None
          self.running = False

      def __del__(self):
@@ -3455,19 +3454,39 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
          del workers
          os.kill(os.getpid(), signal.SIGTERM)

-     def launch(self, op_id=-1):
+     def launch(self, op_id, op_type, ftok_keys):
          """
          Launch Python multiprocessing pool.

          Args:
-             op_id: ID for operation to have Python multiprocessing pool launched
+             op_id (int): ID for operation to have Python multiprocessing pool launched
+             op_type (str): Indicate MapOp / BatchOp
+             ftok_keys (list[int]): the list of ftok keys for the msg queue and shm queue

          Returns:
              Python multiprocessing pool is launched.
          """
          self.python_threads_to_workers = {}
+
+         if not isinstance(op_id, int):
+             raise RuntimeError("The op_id is not int.")
          self.op_id = op_id
-         logger.info("Launching new Python multiprocessing pool for Op: " + str(self.op_id))
+
+         valid_op_type = [cde.MAP_OP, cde.BATCH_OP]
+         if op_type not in valid_op_type:
+             raise RuntimeError("The op_type: {} is not in {}.".format(op_type, valid_op_type))
+         self.op_type = op_type
+
+         if not isinstance(ftok_keys, list):
+             raise RuntimeError("The ftok_keys is not a list.")
+         if not all(isinstance(x, int) for x in ftok_keys):
+             raise RuntimeError("The items in ftok_keys are not all int.")
+         if len(ftok_keys) != self.num_parallel_workers:
+             raise RuntimeError("The len of ftok_keys is not equal to num_parallel_workers.")
+         self.ftok_keys = ftok_keys
+
+         logger.info("Launching new Python multiprocessing pool for Op: " + self.op_type + "(" + str(self.op_id) + \
+                     "), ftok_keys: " + str(self.ftok_keys))
          if self.is_mp_enabled():
              message = "Launching a new Python multiprocessing pool while a pool already exists!" + \
                        " The existing pool will be terminated first."
@@ -3490,30 +3509,21 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
              raise Exception("Pool was already created, close it first.")

          self.workers = []
-         self.pipes = []
-         self.check_interval = get_multiprocessing_timeout_interval()
          self.warning_ctl = multiprocessing.Value('i', 0)
-         if self.start_method == "fork":
-             # Construct python worker processes
-             for worker_id in range(self.num_parallel_workers):
-                 worker = _MPWorker(self.operations, self.warning_ctl, self.max_rowsize, worker_id)
-                 worker.start()
-                 self.workers.append(worker)
-         else:
-             multiprocessing.set_start_method(self.start_method, True)

-             # Construct python worker processes
-             for worker_id in range(self.num_parallel_workers):
-                 shared_memory = get_enable_shared_mem()
-                 pipe = Pipe(self.warning_ctl, shared_memory=shared_memory, max_rowsize=self.max_rowsize)
-                 self.check_interval = get_multiprocessing_timeout_interval()
-                 worker = multiprocessing.Process(target=WorkerTarget(self.operations, pipe, worker_id),
-                                                  name="MapWorker" + str(worker_id), daemon=True)
-                 self.workers.append(worker)
-                 self.pipes.append(pipe)
-                 worker.start()
+         multiprocessing.set_start_method(self.start_method, True)
+
+         # Construct python worker processes
+         for worker_id in range(self.num_parallel_workers):
+             eof = multiprocessing.Event()
+             worker = multiprocessing.Process(target=WorkerTarget(eof, self.operations, worker_id, self.op_type,
+                                                                  self.ftok_keys[worker_id]),
+                                              name="MapWorker" + str(worker_id), daemon=True)
+             self.eof_workers.append(eof)
+             self.workers.append(worker)
+             worker.start()

-             multiprocessing.set_start_method("fork", True)
+         multiprocessing.set_start_method("fork", True)

          logger.info("Launch worker process(es): {}".format(self.get_pids()))

@@ -3527,6 +3537,20 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
          atexit.register(lambda cleanup: cleanup()() if cleanup() is not None else None,
                          weakref.WeakMethod(self.terminate))

+         # Ensure that all workers are in the running state
+         start = time.time()
+         wait_time = 120  # 120s
+         while True:
+             if self.is_running():
+                 logger.info("All workers have reached the running state.")
+                 break
+             else:
+                 time.sleep(0.5)
+                 if time.time() - start > wait_time:
+                     logger.error("All worker processes have not reached the running state within " + str(wait_time) +
+                                  " seconds, data processing errors may occur.")
+                     break
+
      def terminate(self):
          if self.running:
              # abort the monitor first and then close all the workers
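
Note: launch() now blocks until every worker reports running, giving up after 120 seconds with an error log rather than an exception. The poll-with-deadline idiom in isolation (the helper name is ours):

    import time

    def wait_until(predicate, timeout=120.0, interval=0.5):
        """Poll predicate until it is true or the deadline passes."""
        deadline = time.time() + timeout
        while not predicate():
            if time.time() > deadline:
                return False                         # caller logs and carries on
            time.sleep(interval)
        return True

    # e.g. wait_until(pool.is_running) would mirror the loop added above
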
@@ -3555,7 +3579,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
                  continue
          return self.pids

-     def add_new_workers(self, num_new_workers):
+     def add_new_workers(self, num_new_workers, op_type, ftok_keys):
+         """Used by AutoTune"""
          logger.info(
              "Increasing num_parallel_workers of Python Multiprocessing pool for Op:" + str(self.op_id) +
              ", old num_workers=" + str(self.num_parallel_workers) + " new num_workers=" + str(
@@ -3563,9 +3588,14 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
                  num_new_workers) + ".")
          self.terminate()
          self.num_parallel_workers += num_new_workers
-         self.launch(self.op_id)

-     def remove_workers(self, num_removed_workers):
+         if self.num_parallel_workers != len(ftok_keys):
+             raise RuntimeError("Add new workers failed, the num_workers is not equal to the size of ftok_keys.")
+
+         self.launch(self.op_id, op_type, ftok_keys)
+
+     def remove_workers(self, num_removed_workers, op_type, ftok_keys):
+         """Used by AutoTune"""
          logger.info(
              "Decreasing num_parallel_workers of Python Multiprocessing pool for Op:" + str(self.op_id) +
              ", old num_workers=" + str(self.num_parallel_workers) + " new num_workers=" + str(
@@ -3573,59 +3603,14 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
                  num_removed_workers) + ".")
          self.terminate()
          self.num_parallel_workers -= num_removed_workers
-         self.launch(self.op_id)

-     def is_mp_enabled(self):
-         return self.workers is not None
+         if self.num_parallel_workers != len(ftok_keys):
+             raise RuntimeError("Remove workers failed, the num_workers is not equal to the size of ftok_keys.")

-     def execute(self, idx, *args):
-         """
-         Execute
-         """
-         t_id = threading.get_ident()
-         # get the worker_id from Python layer cache first, get from Cpp layer if not found.
-         worker_id = self.python_threads_to_workers.setdefault(t_id, self.get_thread_to_worker())
-         if worker_id >= len(self.workers):
-             raise RuntimeError("[Internal] worker_id value is greater than number of available workers!")
-
-         # todo check_iterator_cleanup
-         if self.is_running() and check_iterator_cleanup() is False:
-             if self.start_method == "fork":
-                 return self.workers[worker_id].execute(idx, *args)
-             # spawn mode
-             self.pipes[worker_id].master_send(idx, args)
-             time_s = time.time()
-             wait_count = 1
-             while True:
-                 cost_time = time.time() - time_s
-                 if cost_time / self.check_interval >= wait_count:
-                     wait_count += 1
-                     logger.warning("It has been waiting for " + "%.3f" % cost_time + "s because the sub-process "
-                                    "worker of the map operation is hanging. "
-                                    "Check whether the user defined data transform is too slow or the "
-                                    "output data is too large. You can also set the timeout interval by "
-                                    "ds.config.set_multiprocessing_timeout_interval to adjust the output frequency "
-                                    "of this log.")
-                     pid = self.workers[worker_id].pid
-                     logger.warning("Map worker subprocess ID {} is stuck.".format(pid))
-                     install_status, _ = subprocess.getstatusoutput("py-spy --version")
-                     if install_status == 0:
-                         stack = subprocess.getoutput("py-spy dump -p {} -l".format(pid))
-                         logger.warning("Map worker subprocess stack:\n{}".format(stack))
-                     else:
-                         logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
-                 try:
-                     res = self.pipes[worker_id].master_receive()
-                 except queue.Empty:
-                     continue
-                 if res is None:
-                     # receive finish signal
-                     return None
-                 if isinstance(res, ExceptionHandler):
-                     res.reraise()
-                 return res
+         self.launch(self.op_id, op_type, ftok_keys)

-         return None
+     def is_mp_enabled(self):
+         return self.workers is not None

      def _launch_monitor(self):
          """
@@ -3634,10 +3619,10 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
          The watch dog will clean up subprocesses and main process when any subprocess exited.
          """
          if platform.system().lower() != 'windows':
-             self.eof = multiprocessing.Event()
+             self.eof_clean_process = multiprocessing.Event()
              self.cleaning_process = multiprocessing.Process(target=self._clean_process,
                                                              name="MapCleanProcess",
-                                                             args=(self.ppid, self.workers, self.eof),
+                                                             args=(self.ppid, self.workers, self.eof_clean_process),
                                                              daemon=True)
              self.cleaning_process.start()
              logger.info("Launch clean process {} to monitor worker "
@@ -3653,8 +3638,9 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
          """Deregister workers monitored by the watch dog and join clean process."""
          if get_enable_watchdog():
              cde.deregister_worker_pids(id(self))
-         if hasattr(self, 'eof') and self.eof is not None:
-             self.eof.set()
+         if hasattr(self, 'eof') and self.eof_clean_process is not None:
+             logger.info("Set eof flag for cleaning_process.")
+             self.eof_clean_process.set()
          if hasattr(self, 'cleaning_process') and self.cleaning_process is not None:
              # let the quit event notify the cleaning process to exit
              self.cleaning_process.join(timeout=5)
@@ -3665,20 +3651,14 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):

      def is_running(self):
          if hasattr(self, 'workers') and self.workers is not None:
-             if self.start_method == "fork":
-                 return all([w.is_alive() for w in self.workers])
              return all([worker_is_alive(w) for w in self.workers])
          return False

      def close_all_workers(self):
          """Close all the subprocess workers"""
          if hasattr(self, 'workers') and self.workers is not None:
-             if self.start_method == "fork":
-                 for w in self.workers:
-                     w.close()
-             else:
-                 for i, w in enumerate(self.workers):
-                     close_worker(w, self.pipes[i])
+             for index in range(len(self.workers)):
+                 close_worker(self.workers[index], self.eof_workers[index])

              check_interval = get_multiprocessing_timeout_interval()
              for w in self.workers:
@@ -3695,12 +3675,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
                          continue
                      raise e
                  try:
-                     if self.start_method == "fork":
-                         if w.is_alive():
-                             os.close(subprocess_file_descriptor)
-                     else:
-                         if worker_is_alive(w):
-                             os.close(subprocess_file_descriptor)
+                     if worker_is_alive(w):
+                         os.close(subprocess_file_descriptor)
                  except OSError as e:
                      # Maybe the file descriptor had been released, so ignore the 'Bad file descriptor'
                      if "Bad file descriptor" not in str(e):
@@ -3709,8 +3685,12 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
              # use clear to release the handle which is better than self.workers = None
              self.workers.clear()
              self.workers = None
-             self.pipes.clear()
-             self.pipes = None
+             self.eof_workers.clear()
+             self.eof_workers = []
+
+             # a slow PyFunc can keep the main process from exiting, so release the
+             # shm & msg here
+             cde.release_shm_and_msg_by_worker_pids(self.pids)
              self.pids = None

@@ -3788,7 +3768,22 @@ class MapDataset(UnionBaseDataset):

          count_old_transforms, count_new_transforms, count_non_data_vision_transforms = \
              self.__count_transforms(operations)
+         count_py_ops = self.__count_py_ops(operations)
          count_pyfunc = self.__count_pyfuncs(operations)
+
+         # Whether to execute ops in the thread mode
+         # op_type                        python_multiprocessing    run_in_thread
+         # c_op(s)                        false                     yes
+         # c_op(s)                        true                      yes
+         # py_op(s) / PyFunc              false                     yes
+         # py_op(s) / PyFunc              true                      no
+         # c_op(s) + py_op(s) / PyFunc    false                     yes
+         # c_op(s) + py_op(s) / PyFunc    true                      no
+         run_in_thread = not self.python_multiprocessing or (count_pyfunc == 0 and count_py_ops == 0) or get_debug_mode()
+
+         if self.python_multiprocessing and platform.system().lower() == 'windows':
+             run_in_thread = True
+
          if count_new_transforms + count_pyfunc == len(operations):
              prev_op = None
              for op in operations:
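
Note: the decision table above reduces to one predicate plus a Windows override; restated as a standalone function with the same logic (argument names are ours):

    import platform

    def run_in_thread(python_multiprocessing, count_pyfunc, count_py_ops, debug_mode=False):
        # Thread mode unless multiprocessing is requested AND there is at least
        # one user pyfunc or built-in Python op to farm out to worker processes.
        decision = (not python_multiprocessing
                    or (count_pyfunc == 0 and count_py_ops == 0)
                    or debug_mode)
        if python_multiprocessing and platform.system().lower() == "windows":
            decision = True                          # process mode is not used on Windows
        return decision

    assert run_in_thread(False, count_pyfunc=3, count_py_ops=2) is True   # no multiprocessing
    assert run_in_thread(True, count_pyfunc=0, count_py_ops=0) is True    # pure C ops
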
@@ -3806,18 +3801,43 @@ class MapDataset(UnionBaseDataset):
                          op.implementation = Implementation.C
                      prev_op = op
              operations = self.__insert_debug_wrapper(operations)
-             operations = transforms.transforms.Compose.reduce(operations)
+             if run_in_thread:
+                 operations = transforms.transforms.Compose.reduce(operations)
          elif count_old_transforms + count_pyfunc + count_non_data_vision_transforms == len(operations):
              operations = self.__insert_debug_wrapper(operations)
-             operations = transforms.py_transforms.Compose.reduce(operations)
+             if run_in_thread:
+                 operations = transforms.py_transforms.Compose.reduce(operations)
          else:
              raise RuntimeError("Mixing old legacy c/py_transforms and new unified transforms is not allowed.")

-         self.operations = self.__process_final_operations(operations)
+         if run_in_thread:
+             self.operations = self.__process_final_operations(operations)
+         else:
+             self.operations = operations
          self.prepare_multiprocessing()

          callbacks = [cb.create_runtime_obj() for cb in self.callbacks]
-         return cde.MapNode(children[0], self.operations, self.input_columns, self.output_columns,
+
+         ## thread mode
+         if run_in_thread:
+             return cde.MapNode(children[0], self.operations, self.input_columns, self.output_columns,
+                                callbacks, OffloadToManualOffloadMode.get(self.offload), self.process_pool)
+
+         # Bind self.operations with self.process_pool
+         class _BindProcessPoolWithOperations:
+             def __init__(self, pool, operations):
+                 self.pool = pool
+                 self.operations = operations
+
+             def __call__(self):
+                 pass
+
+         self.bound = _BindProcessPoolWithOperations(self.process_pool, self.operations)
+
+         ## process mode
+         # in multiprocess mode we just pass self.bound, which is not really used in the C++ layer:
+         # while the pipeline is running, the map thread transfers data to the Python worker
+         # processes through C++ shm & msg
+         return cde.MapNode(children[0], [self.bound], self.input_columns, self.output_columns,
                             callbacks, OffloadToManualOffloadMode.get(self.offload), self.process_pool)

      def __deepcopy__(self, memodict):
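
Note: in process mode the Python operations never execute in the C++ layer; _BindProcessPoolWithOperations is a do-nothing callable whose only job is to keep the pool and operations referenced for as long as the MapNode holds them. The lifetime-pinning trick in isolation (names are ours):

    class _BindLifetime:
        """A no-op callable that pins references so they outlive this scope."""

        def __init__(self, *resources):
            self._resources = resources              # referenced => not garbage collected

        def __call__(self):
            pass                                     # never expected to do real work

    pool, operations = object(), [abs, str]          # stand-ins
    bound = _BindLifetime(pool, operations)          # hand `bound` to the other layer
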
@@ -3870,10 +3890,22 @@ class MapDataset(UnionBaseDataset):
      @staticmethod
      def __count_pyfuncs(operations):
          """
-         Count the number of pyfuncs operations
+         Count the number of pyfunc operations which are defined by the user
          """
          return sum([1 if isinstance(op, FuncWrapper) else 0 for op in operations])

+     @staticmethod
+     def __count_py_ops(operations):
+         """
+         Count the number of built-in Python operations
+         """
+         count = 0
+         for op in operations:
+             if hasattr(op, "implementation") and op.implementation != Implementation.C \
+                     and op.implementation is not None:
+                 count += 1
+         return count
+
      @staticmethod
      def __count_transforms(operations):
          """
@@ -3937,7 +3969,6 @@ class MapDataset(UnionBaseDataset):
                            " Ignoring Python multiprocessing for map operation.")
              return
          if self.python_multiprocessing:
-             iter_specific_operations = []
              callable_list = []

              # If user didn't specify num_parallel_workers, set it to default
@@ -3954,18 +3985,6 @@ class MapDataset(UnionBaseDataset):
              self.process_pool = _PythonMultiprocessing(get_multiprocessing_start_method(),
                                                         self.num_parallel_workers, str(self),
                                                         callable_list, self.max_rowsize)
-             # Pass #2
-             idx = 0
-             for op in self.operations:
-                 # our c transforms is now callable and should not be run in Python multithreading
-                 if MapDataset.__operation_valid_for_multiprocessing(op):
-                     # Wrap Python callable into _PythonCallable
-                     iter_specific_operations.append(_PythonCallable(op, idx, self.process_pool))
-                     idx += 1
-                 else:
-                     # CPP ops remain the same
-                     iter_specific_operations.append(op)
-             self.operations = iter_specific_operations

      def __insert_debug_wrapper(self, operations):
          """