mindspore 2.6.0__cp310-cp310-win_amd64.whl → 2.7.0__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic. Click here for more details.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +2 -2
- mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +42 -11
- mindspore/_extends/builtin_operations.py +3 -3
- mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
- mindspore/_extends/optimize/cell_utils.py +96 -0
- mindspore/_extends/parallel_compile/akg_compiler/custom.py +1109 -0
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/__init__.py +3 -3
- mindspore/_extends/parse/compile_config.py +44 -22
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -2
- mindspore/_extends/parse/parser.py +64 -83
- mindspore/_extends/parse/resources.py +39 -0
- mindspore/_extends/parse/standard_method.py +47 -14
- mindspore/_extends/parse/trope.py +8 -1
- mindspore/_extends/pijit/__init__.py +1 -2
- mindspore/_extends/pijit/pijit_func_white_list.py +2 -5
- mindspore/amp.py +4 -22
- mindspore/atlprov.dll +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/adasum.py +1 -1
- mindspore/boost/boost_cell_wrapper.py +4 -4
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +43 -12
- mindspore/common/_grad_function.py +2 -1
- mindspore/common/_pijit_context.py +28 -7
- mindspore/common/_stub_tensor.py +1 -209
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +177 -52
- mindspore/common/_utils.py +9 -1
- mindspore/common/api.py +338 -208
- mindspore/common/dtype.py +108 -57
- mindspore/common/dump.py +11 -16
- mindspore/common/dynamic_shape/__init__.py +0 -0
- mindspore/common/{auto_dynamic_shape.py → dynamic_shape/auto_dynamic_shape.py} +17 -23
- mindspore/common/dynamic_shape/enable_dynamic.py +197 -0
- mindspore/common/file_system.py +59 -9
- mindspore/common/generator.py +2 -3
- mindspore/common/hook_handle.py +33 -5
- mindspore/common/jit_config.py +1 -1
- mindspore/common/jit_trace.py +84 -105
- mindspore/common/np_dtype.py +3 -3
- mindspore/common/parameter.py +27 -29
- mindspore/common/recompute.py +5 -7
- mindspore/common/sparse_tensor.py +0 -3
- mindspore/common/symbol.py +0 -1
- mindspore/common/tensor.py +84 -133
- mindspore/communication/_comm_helper.py +46 -4
- mindspore/communication/management.py +79 -7
- mindspore/context.py +47 -38
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/transforms.py +1 -1
- mindspore/dataset/core/config.py +38 -4
- mindspore/dataset/engine/datasets.py +350 -322
- mindspore/dataset/engine/datasets_user_defined.py +69 -23
- mindspore/dataset/engine/iterators.py +2 -2
- mindspore/dataset/engine/obs/config_loader.py +2 -2
- mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
- mindspore/dataset/transforms/c_transforms.py +2 -2
- mindspore/dataset/transforms/py_transforms.py +7 -3
- mindspore/dataset/transforms/transforms.py +10 -6
- mindspore/dataset/vision/__init__.py +1 -1
- mindspore/dataset/vision/py_transforms.py +8 -8
- mindspore/dataset/vision/transforms.py +17 -5
- mindspore/dataset/vision/utils.py +632 -21
- mindspore/dataset/vision/validators.py +1 -0
- mindspore/device_context/ascend/device.py +1 -1
- mindspore/device_context/ascend/op_tuning.py +35 -1
- mindspore/device_context/gpu/__init__.py +2 -2
- mindspore/device_context/gpu/device.py +1 -1
- mindspore/device_context/gpu/op_precision.py +4 -2
- mindspore/device_context/gpu/op_tuning.py +6 -3
- mindspore/device_manager.py +16 -9
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +5 -4
- mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
- mindspore/experimental/optim/adadelta.py +13 -20
- mindspore/experimental/optim/adagrad.py +15 -22
- mindspore/experimental/optim/adam.py +17 -24
- mindspore/experimental/optim/adamax.py +14 -22
- mindspore/experimental/optim/adamw.py +28 -34
- mindspore/experimental/optim/asgd.py +15 -25
- mindspore/experimental/optim/lr_scheduler.py +27 -45
- mindspore/experimental/optim/nadam.py +14 -24
- mindspore/experimental/optim/optimizer.py +13 -23
- mindspore/experimental/optim/radam.py +18 -24
- mindspore/experimental/optim/rmsprop.py +14 -25
- mindspore/experimental/optim/rprop.py +15 -26
- mindspore/experimental/optim/sgd.py +9 -19
- mindspore/hal/__init__.py +4 -4
- mindspore/hal/contiguous_tensors_handle.py +2 -2
- mindspore/hal/memory.py +1 -0
- mindspore/include/api/cell.h +65 -5
- mindspore/include/api/cfg.h +24 -7
- mindspore/include/api/context.h +1 -0
- mindspore/include/api/delegate.h +10 -2
- mindspore/include/api/dual_abi_helper.h +100 -19
- mindspore/include/api/graph.h +14 -1
- mindspore/include/api/kernel.h +16 -3
- mindspore/include/api/kernel_api.h +9 -1
- mindspore/include/api/metrics/accuracy.h +9 -0
- mindspore/include/api/model.h +8 -1
- mindspore/include/api/model_group.h +4 -0
- mindspore/include/api/model_parallel_runner.h +2 -0
- mindspore/include/api/status.h +48 -10
- mindspore/include/api/types.h +8 -3
- mindspore/include/c_api/model_c.h +0 -58
- mindspore/include/c_api/tensor_c.h +0 -26
- mindspore/include/dataset/constants.h +9 -0
- mindspore/include/dataset/vision_ascend.h +1 -1
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/tools/cifar10.py +61 -11
- mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mindspore_ops_host.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mint/__init__.py +4 -44
- mindspore/mint/distributed/__init__.py +5 -0
- mindspore/mint/distributed/distributed.py +425 -19
- mindspore/mint/nn/__init__.py +1 -1
- mindspore/mint/nn/functional.py +53 -6
- mindspore/mint/nn/layer/_functions.py +163 -294
- mindspore/mint/nn/layer/activation.py +8 -6
- mindspore/mint/nn/layer/conv.py +125 -101
- mindspore/mint/nn/layer/normalization.py +11 -25
- mindspore/mint/optim/adam.py +19 -18
- mindspore/mint/optim/adamw.py +14 -8
- mindspore/mint/optim/sgd.py +5 -5
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/nn/cell.py +488 -620
- mindspore/nn/grad/cell_grad.py +11 -12
- mindspore/nn/layer/activation.py +36 -36
- mindspore/nn/layer/basic.py +74 -77
- mindspore/nn/layer/channel_shuffle.py +4 -4
- mindspore/nn/layer/combined.py +4 -2
- mindspore/nn/layer/conv.py +86 -85
- mindspore/nn/layer/dense.py +9 -7
- mindspore/nn/layer/embedding.py +50 -52
- mindspore/nn/layer/image.py +38 -40
- mindspore/nn/layer/math.py +111 -112
- mindspore/nn/layer/normalization.py +56 -44
- mindspore/nn/layer/pooling.py +58 -63
- mindspore/nn/layer/rnn_cells.py +33 -33
- mindspore/nn/layer/rnns.py +56 -56
- mindspore/nn/layer/thor_layer.py +74 -73
- mindspore/nn/layer/transformer.py +11 -1
- mindspore/nn/learning_rate_schedule.py +20 -20
- mindspore/nn/loss/loss.py +79 -81
- mindspore/nn/optim/adam.py +2 -4
- mindspore/nn/optim/adasum.py +2 -2
- mindspore/nn/optim/lamb.py +1 -3
- mindspore/nn/optim/optimizer.py +1 -1
- mindspore/nn/optim/tft_wrapper.py +2 -3
- mindspore/nn/optim/thor.py +2 -2
- mindspore/nn/probability/distribution/_utils/utils.py +2 -2
- mindspore/nn/probability/distribution/exponential.py +2 -1
- mindspore/nn/probability/distribution/poisson.py +2 -1
- mindspore/nn/sparse/sparse.py +3 -3
- mindspore/nn/wrap/cell_wrapper.py +73 -42
- mindspore/nn/wrap/grad_reducer.py +37 -52
- mindspore/nn/wrap/loss_scale.py +72 -74
- mindspore/numpy/array_creations.py +7 -7
- mindspore/numpy/fft.py +1 -1
- mindspore/numpy/math_ops.py +1 -1
- mindspore/numpy/utils_const.py +1 -1
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
- mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
- mindspore/ops/_grad_experimental/grad_inner_ops.py +0 -9
- mindspore/ops/_op_impl/cpu/__init__.py +1 -0
- mindspore/{experimental/es/__init__.py → ops/_op_impl/cpu/joinedstr_op.py} +12 -6
- mindspore/ops/_vmap/vmap_array_ops.py +6 -13
- mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +29 -10
- mindspore/ops/auto_generate/gen_extend_func.py +5 -55
- mindspore/ops/auto_generate/gen_ops_def.py +753 -273
- mindspore/ops/auto_generate/gen_ops_prim.py +1687 -958
- mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
- mindspore/ops/composite/__init__.py +10 -0
- mindspore/ops/composite/base.py +9 -5
- mindspore/ops/composite/multitype_ops/__init__.py +12 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +132 -108
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
- mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
- mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
- mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
- mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
- mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
- mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
- mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
- mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
- mindspore/ops/function/__init__.py +4 -1
- mindspore/ops/function/_add_attr_func.py +11 -6
- mindspore/ops/function/array_func.py +17 -100
- mindspore/ops/function/debug_func.py +8 -5
- mindspore/ops/function/grad/grad_func.py +5 -13
- mindspore/ops/function/math_func.py +65 -399
- mindspore/ops/function/nn_func.py +44 -61
- mindspore/ops/function/other_func.py +4 -1
- mindspore/ops/function/random_func.py +31 -4
- mindspore/ops/functional.py +2 -3
- mindspore/ops/functional_overload.py +486 -18
- mindspore/ops/op_info_register.py +21 -0
- mindspore/ops/operations/__init__.py +5 -2
- mindspore/ops/operations/_custom_ops_utils.py +675 -8
- mindspore/ops/operations/_inner_ops.py +14 -18
- mindspore/ops/operations/_sequence_ops.py +1 -1
- mindspore/ops/operations/array_ops.py +4 -50
- mindspore/ops/operations/comm_ops.py +186 -41
- mindspore/ops/operations/custom_ops.py +244 -175
- mindspore/ops/operations/debug_ops.py +55 -4
- mindspore/ops/operations/image_ops.py +13 -13
- mindspore/ops/operations/manually_defined/ops_def.py +27 -28
- mindspore/ops/operations/math_ops.py +8 -9
- mindspore/ops/operations/nn_ops.py +6 -7
- mindspore/ops/primitive.py +9 -20
- mindspore/ops/tensor_method.py +52 -11
- mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
- mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
- mindspore/ops_generate/api/functions_cc_generator.py +58 -10
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
- mindspore/ops_generate/common/base_generator.py +14 -0
- mindspore/ops_generate/common/gen_constants.py +7 -2
- mindspore/ops_generate/common/gen_utils.py +0 -19
- mindspore/ops_generate/common/op_proto.py +11 -4
- mindspore/ops_generate/common/template.py +88 -11
- mindspore/ops_generate/gen_ops.py +1 -1
- mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
- mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
- mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -16
- mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
- mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
- mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
- mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
- mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
- mindspore/parallel/_auto_parallel_context.py +9 -17
- mindspore/parallel/_cell_wrapper.py +106 -40
- mindspore/parallel/_parallel_serialization.py +4 -3
- mindspore/parallel/_ps_context.py +4 -6
- mindspore/parallel/_tensor.py +167 -12
- mindspore/parallel/_transformer/moe.py +1 -1
- mindspore/parallel/_transformer/transformer.py +17 -12
- mindspore/parallel/_utils.py +5 -11
- mindspore/parallel/auto_parallel.py +33 -12
- mindspore/parallel/checkpoint_convert.py +3 -3
- mindspore/parallel/checkpoint_transform.py +5 -1
- mindspore/parallel/cluster/process_entity/_api.py +88 -49
- mindspore/parallel/cluster/process_entity/_utils.py +95 -7
- mindspore/parallel/cluster/run.py +48 -7
- mindspore/parallel/function/__init__.py +8 -1
- mindspore/parallel/function/reshard_func.py +7 -6
- mindspore/parallel/nn/__init__.py +15 -2
- mindspore/parallel/nn/parallel_cell_wrapper.py +50 -14
- mindspore/parallel/nn/parallel_grad_reducer.py +7 -14
- mindspore/parallel/shard.py +9 -23
- mindspore/parallel/transform_safetensors.py +468 -174
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
- mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +3 -0
- mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
- mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
- mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
- mindspore/profiler/analysis/task_manager.py +1 -1
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
- mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +10 -9
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +43 -23
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
- mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
- mindspore/profiler/common/constant.py +16 -0
- mindspore/profiler/common/msprof_cmd_tool.py +2 -2
- mindspore/profiler/common/path_manager.py +9 -0
- mindspore/profiler/common/profiler_context.py +50 -29
- mindspore/profiler/common/profiler_info.py +0 -16
- mindspore/profiler/common/profiler_meta_data.py +1 -0
- mindspore/profiler/common/profiler_op_analyse.py +239 -0
- mindspore/profiler/common/profiler_output_path.py +23 -8
- mindspore/profiler/common/profiler_parameters.py +128 -35
- mindspore/profiler/dynamic_profile/__init__.py +0 -0
- mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
- mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
- mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
- mindspore/profiler/dynamic_profiler.py +374 -338
- mindspore/profiler/envprofiler.py +42 -12
- mindspore/profiler/experimental_config.py +112 -7
- mindspore/profiler/mstx.py +33 -12
- mindspore/profiler/platform/__init__.py +2 -3
- mindspore/profiler/platform/cpu_profiler.py +10 -4
- mindspore/profiler/platform/npu_profiler.py +30 -20
- mindspore/profiler/profiler.py +218 -154
- mindspore/profiler/profiler_action_controller.py +65 -77
- mindspore/profiler/profiler_interface.py +2 -2
- mindspore/profiler/schedule.py +10 -4
- mindspore/rewrite/common/config.py +1 -0
- mindspore/rewrite/common/namer.py +1 -0
- mindspore/rewrite/common/namespace.py +1 -0
- mindspore/rewrite/node/node.py +31 -11
- mindspore/rewrite/parsers/assign_parser.py +1 -1
- mindspore/rewrite/symbol_tree/symbol_tree.py +2 -2
- mindspore/run_check/_check_version.py +7 -10
- mindspore/runtime/__init__.py +8 -6
- mindspore/runtime/event.py +10 -4
- mindspore/runtime/executor.py +87 -45
- mindspore/runtime/memory.py +22 -30
- mindspore/runtime/thread_bind_core.py +299 -165
- mindspore/safeguard/rewrite_obfuscation.py +12 -13
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/_utils.py +9 -5
- mindspore/train/amp.py +43 -23
- mindspore/train/callback/__init__.py +5 -5
- mindspore/train/callback/_callback.py +2 -1
- mindspore/train/callback/_checkpoint.py +4 -14
- mindspore/train/callback/_flops_collector.py +11 -7
- mindspore/train/callback/_landscape.py +0 -1
- mindspore/train/callback/_train_fault_tolerance.py +72 -18
- mindspore/train/data_sink.py +15 -6
- mindspore/train/dataset_helper.py +14 -5
- mindspore/train/model.py +49 -47
- mindspore/train/serialization.py +168 -126
- mindspore/train/summary/summary_record.py +13 -2
- mindspore/train/train_thor/model_thor.py +2 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +3 -2
- mindspore/utils/dryrun.py +0 -6
- mindspore/utils/runtime_execution_order_check.py +162 -78
- mindspore/utils/sdc_detect.py +68 -0
- mindspore/utils/utils.py +14 -17
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/METADATA +5 -4
- {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/RECORD +400 -439
- mindspore/_deprecated/jit.py +0 -198
- mindspore/_extends/remote/kernel_build_server_ascend.py +0 -75
- mindspore/communication/_hccl_management.py +0 -297
- mindspore/experimental/es/embedding_service.py +0 -891
- mindspore/experimental/es/embedding_service_layer.py +0 -581
- mindspore/profiler/common/validator/__init__.py +0 -14
- mindspore/profiler/common/validator/validate_path.py +0 -84
- mindspore/profiler/parser/__init__.py +0 -14
- mindspore/profiler/parser/aicpu_data_parser.py +0 -272
- mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
- mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
- mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
- mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
- mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
- mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
- mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
- mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
- mindspore/profiler/parser/ascend_flops_generator.py +0 -116
- mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
- mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
- mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
- mindspore/profiler/parser/ascend_memory_generator.py +0 -185
- mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
- mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
- mindspore/profiler/parser/ascend_op_generator.py +0 -334
- mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
- mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
- mindspore/profiler/parser/base_timeline_generator.py +0 -483
- mindspore/profiler/parser/container.py +0 -229
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
- mindspore/profiler/parser/flops_parser.py +0 -531
- mindspore/profiler/parser/framework_enum.py +0 -111
- mindspore/profiler/parser/framework_parser.py +0 -464
- mindspore/profiler/parser/framework_struct.py +0 -61
- mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
- mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
- mindspore/profiler/parser/hccl_parser.py +0 -573
- mindspore/profiler/parser/hwts_log_parser.py +0 -122
- mindspore/profiler/parser/integrator.py +0 -526
- mindspore/profiler/parser/memory_usage_parser.py +0 -277
- mindspore/profiler/parser/minddata_analyzer.py +0 -800
- mindspore/profiler/parser/minddata_parser.py +0 -186
- mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
- mindspore/profiler/parser/op_intermediate_parser.py +0 -149
- mindspore/profiler/parser/optime_parser.py +0 -250
- mindspore/profiler/parser/profiler_info.py +0 -213
- mindspore/profiler/parser/step_trace_parser.py +0 -666
- mindspore/utils/hooks.py +0 -81
- /mindspore/common/{_auto_dynamic.py → dynamic_shape/_auto_dynamic.py} +0 -0
- {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/WHEEL +0 -0
- {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/entry_points.txt +0 -0
- {mindspore-2.6.0.dist-info → mindspore-2.7.0.dist-info}/top_level.txt +0 -0
|
@@ -29,10 +29,8 @@ import atexit
|
|
|
29
29
|
import glob
|
|
30
30
|
import json
|
|
31
31
|
import os
|
|
32
|
-
import queue
|
|
33
32
|
import signal
|
|
34
33
|
import stat
|
|
35
|
-
import subprocess
|
|
36
34
|
import warnings
|
|
37
35
|
|
|
38
36
|
import time
|
|
@@ -41,6 +39,7 @@ import multiprocessing
|
|
|
41
39
|
from importlib import import_module
|
|
42
40
|
import sys
|
|
43
41
|
import threading
|
|
42
|
+
from types import GeneratorType
|
|
44
43
|
|
|
45
44
|
import copy
|
|
46
45
|
import weakref
|
|
@@ -65,7 +64,6 @@ from mindspore.dataset.engine import samplers
|
|
|
65
64
|
from mindspore.dataset.engine.samplers import Shuffle
|
|
66
65
|
from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterator_cleanup, _set_iterator_cleanup, \
|
|
67
66
|
ITERATORS_LIST, _unset_iterator_cleanup, _cleanup_the_iterators_if_created
|
|
68
|
-
from .queue import _SharedQueue, _Queue
|
|
69
67
|
from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
|
|
70
68
|
check_rename, check_device_send, check_take, check_output_shape, check_project, \
|
|
71
69
|
check_sync_wait, check_zip_dataset, check_add_column, check_concat, check_split, check_bucket_batch_by_length, \
|
|
@@ -73,7 +71,8 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
|
|
|
73
71
|
check_total_batch, check_sync_update
|
|
74
72
|
from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
|
|
75
73
|
get_enable_watchdog, get_seed, set_seed, get_debug_mode, get_multiprocessing_timeout_interval, \
|
|
76
|
-
_get_debug_hook_list, get_multiprocessing_start_method
|
|
74
|
+
_get_debug_hook_list, get_multiprocessing_start_method, get_video_backend, set_video_backend, \
|
|
75
|
+
get_error_samples_mode, ErrorSamplesMode
|
|
77
76
|
from ..core.datatypes import mstype_to_detype
|
|
78
77
|
from ..core.validator_helpers import replace_none
|
|
79
78
|
from ..core.py_util_helpers import ExceptionHandler
|
|
@@ -575,6 +574,12 @@ class Dataset:
|
|
|
575
574
|
create shared memory, and represents ``output_columns`` use the second element as the
|
|
576
575
|
unit to create shared memory.
|
|
577
576
|
|
|
577
|
+
.. warning::
|
|
578
|
+
`batch` uses `dill` module implicitly in multiprocessing `spawn` mode to serialize/deserialize
|
|
579
|
+
`per_batch_map`, which is known to be insecure. It is possible to construct malicious pickle data
|
|
580
|
+
which will execute arbitrary code during unpickling. Never load data that could have come from
|
|
581
|
+
untrusted sources, or has been tampered with.
|
|
582
|
+
|
|
578
583
|
Returns:
|
|
579
584
|
Dataset, a new dataset with the above operation applied.
|
|
580
585
|
|
|
@@ -886,6 +891,12 @@ class Dataset:
|
|
|
886
891
|
|
|
887
892
|
- offload (bool, optional): Flag to indicate whether offload is used. Default: ``None``.
|
|
888
893
|
|
|
894
|
+
.. warning::
|
|
895
|
+
`map` uses `dill` module implicitly in multiprocessing `spawn` mode to serialize/deserialize `operations`,
|
|
896
|
+
which is known to be insecure. It is possible to construct malicious pickle data which will
|
|
897
|
+
execute arbitrary code during unpickling. Never load data that could have come from untrusted sources,
|
|
898
|
+
or has been tampered with.
|
|
899
|
+
|
|
889
900
|
Note:
|
|
890
901
|
- Input `operations` accepts TensorOperations defined in mindspore.dataset part, plus user-defined
|
|
891
902
|
Python functions (PyFuncs).
|
|
@@ -1557,7 +1568,7 @@ class Dataset:
|
|
|
1557
1568
|
del api_tree
|
|
1558
1569
|
|
|
1559
1570
|
@check_tuple_iterator
|
|
1560
|
-
def create_tuple_iterator(self, columns=None, num_epochs=-1, output_numpy=False, do_copy=True):
|
|
1571
|
+
def create_tuple_iterator(self, columns=None, num_epochs=-1, output_numpy=False, do_copy=False):
|
|
1561
1572
|
"""
|
|
1562
1573
|
Create an iterator over the dataset that yields samples of type list, whose elements are
|
|
1563
1574
|
the data for each column.
|
|
@@ -1571,7 +1582,7 @@ class Dataset:
|
|
|
1571
1582
|
convert it to Tensor. Default: ``False`` .
|
|
1572
1583
|
do_copy (bool, optional): Whether to copy the data when converting output to Tensor,
|
|
1573
1584
|
or reuse the buffer for better performance, only works when `output_numpy` is ``False`` .
|
|
1574
|
-
Default: ``True`` .
|
|
1585
|
+
Default: ``False`` .
|
|
1575
1586
|
|
|
1576
1587
|
Returns:
|
|
1577
1588
|
Iterator, a dataset iterator that yields samples of type list.
|
|
@@ -1598,7 +1609,7 @@ class Dataset:
|
|
|
1598
1609
|
return TupleIterator(self, columns, num_epochs, output_numpy, do_copy)
|
|
1599
1610
|
|
|
1600
1611
|
@check_dict_iterator
|
|
1601
|
-
def create_dict_iterator(self, num_epochs=-1, output_numpy=False, do_copy=True):
|
|
1612
|
+
def create_dict_iterator(self, num_epochs=-1, output_numpy=False, do_copy=False):
|
|
1602
1613
|
"""
|
|
1603
1614
|
Create an iterator over the dataset that yields samples of type dict,
|
|
1604
1615
|
while the key is the column name and the value is the data.
|
|
@@ -1610,7 +1621,7 @@ class Dataset:
|
|
|
1610
1621
|
convert it to Tensor. Default: ``False`` .
|
|
1611
1622
|
do_copy (bool, optional): Whether to copy the data when converting output to Tensor,
|
|
1612
1623
|
or reuse the buffer for better performance, only works when `output_numpy` is ``False`` .
|
|
1613
|
-
Default: ``True`` .
|
|
1624
|
+
Default: ``False`` .
|
|
1614
1625
|
|
|
1615
1626
|
Returns:
|
|
1616
1627
|
Iterator, a dataset iterator that yields samples of type dict.
|
|
@@ -2740,8 +2751,6 @@ class BatchDataset(UnionBaseDataset):
|
|
|
2740
2751
|
|
|
2741
2752
|
self.process_pool = _PythonMultiprocessing(get_multiprocessing_start_method(), self.num_parallel_workers,
|
|
2742
2753
|
str(self), [self.per_batch_map], self.max_rowsize)
|
|
2743
|
-
# Wrap per_batch_map into _PythonCallable
|
|
2744
|
-
self.per_batch_map = _PythonCallable(self.per_batch_map, 0, self.process_pool)
|
|
2745
2754
|
else:
|
|
2746
2755
|
if self.per_batch_map is not None:
|
|
2747
2756
|
self.per_batch_map = FuncWrapper(self.per_batch_map)
|
|
@@ -3045,95 +3054,6 @@ _OP_NAME = dict()
|
|
|
3045
3054
|
_OP_PROCESS = dict()
|
|
3046
3055
|
|
|
3047
3056
|
|
|
3048
|
-
# PythonCallable wrapper for multiprocess pyfunc
|
|
3049
|
-
class _PythonCallable:
|
|
3050
|
-
"""
|
|
3051
|
-
Internal Python function wrapper for multiprocessing pyfunc.
|
|
3052
|
-
"""
|
|
3053
|
-
|
|
3054
|
-
def __init__(self, py_callable, idx, pool=None):
|
|
3055
|
-
# Original Python callable from user.
|
|
3056
|
-
self.py_callable = py_callable
|
|
3057
|
-
# Process pool created for current iterator.
|
|
3058
|
-
self.pool = pool
|
|
3059
|
-
# Python callable index
|
|
3060
|
-
self.idx = idx
|
|
3061
|
-
|
|
3062
|
-
def __call__(self, *args):
|
|
3063
|
-
result = None
|
|
3064
|
-
get_data_from_worker_process = False
|
|
3065
|
-
while get_data_from_worker_process is False:
|
|
3066
|
-
if self.pool.is_running() and check_iterator_cleanup() is False:
|
|
3067
|
-
try:
|
|
3068
|
-
result = self.pool.execute(self.idx, *args)
|
|
3069
|
-
except multiprocessing.TimeoutError:
|
|
3070
|
-
continue
|
|
3071
|
-
get_data_from_worker_process = True
|
|
3072
|
-
else:
|
|
3073
|
-
# worker process is stopped
|
|
3074
|
-
logger.info("The worker process of map operation is stopped. "
|
|
3075
|
-
"So return None to main thread and break the main thread.")
|
|
3076
|
-
return None
|
|
3077
|
-
# got value from worker process
|
|
3078
|
-
if not isinstance(result, tuple) and get_data_from_worker_process is True:
|
|
3079
|
-
result = (result,)
|
|
3080
|
-
return result
|
|
3081
|
-
|
|
3082
|
-
def to_json(self):
|
|
3083
|
-
return self.py_callable.to_json()
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
# used when python_multiprocessing=True in map
|
|
3087
|
-
class Pipe:
|
|
3088
|
-
"""
|
|
3089
|
-
Class to handle communication between the master process and the worker processes.
|
|
3090
|
-
"""
|
|
3091
|
-
|
|
3092
|
-
def __init__(self, warning_ctl, shared_memory=False, max_rowsize=(-1, -1)):
|
|
3093
|
-
self.shared_memory = shared_memory
|
|
3094
|
-
self.eof = multiprocessing.Event()
|
|
3095
|
-
if self.shared_memory:
|
|
3096
|
-
self.in_queue = _SharedQueue(1, warning_ctl, max_rowsize=max_rowsize[0])
|
|
3097
|
-
self.res_queue = _SharedQueue(1, warning_ctl, max_rowsize=max_rowsize[1])
|
|
3098
|
-
else:
|
|
3099
|
-
self.in_queue = _Queue(1)
|
|
3100
|
-
self.res_queue = _Queue(1)
|
|
3101
|
-
self.in_queue.cancel_join_thread() # Ensure that the process does not hung when exiting
|
|
3102
|
-
|
|
3103
|
-
def master_send(self, func_index, data):
|
|
3104
|
-
self.in_queue.put_nowait((func_index, *data))
|
|
3105
|
-
|
|
3106
|
-
def master_receive(self):
|
|
3107
|
-
if self.eof is None:
|
|
3108
|
-
raise RuntimeError("EOF is none when get data from worker.")
|
|
3109
|
-
if self.eof.is_set():
|
|
3110
|
-
return None
|
|
3111
|
-
return self.res_queue.get(timeout=1)
|
|
3112
|
-
|
|
3113
|
-
def master_close(self):
|
|
3114
|
-
self.eof.set()
|
|
3115
|
-
self.send_finish_signal_to_worker()
|
|
3116
|
-
self.send_finish_signal()
|
|
3117
|
-
|
|
3118
|
-
def send_finish_signal(self):
|
|
3119
|
-
self.worker_send(None)
|
|
3120
|
-
|
|
3121
|
-
def send_finish_signal_to_worker(self):
|
|
3122
|
-
self.master_send(0, "QUIT")
|
|
3123
|
-
|
|
3124
|
-
def worker_send(self, data):
|
|
3125
|
-
self.res_queue.put_until(data, timeout=1, exit_signal=self.eof)
|
|
3126
|
-
|
|
3127
|
-
def worker_receive(self):
|
|
3128
|
-
result = self.in_queue.get_until(timeout=1, exit_signal=self.eof)
|
|
3129
|
-
if result is None:
|
|
3130
|
-
return result
|
|
3131
|
-
if len(result) == 1:
|
|
3132
|
-
raise RuntimeError(f"Corrupted data. Worker received {len(result)} elements, it should be more than 1.")
|
|
3133
|
-
func_index, *data = result
|
|
3134
|
-
return func_index, tuple(data)
|
|
3135
|
-
|
|
3136
|
-
|
|
3137
3057
|
def _main_process_already_exit():
|
|
3138
3058
|
"""
|
|
3139
3059
|
Judge whether main process already exit.
|
|
@@ -3146,15 +3066,18 @@ def _main_process_already_exit():
|
|
|
3146
3066
|
return False
|
|
3147
3067
|
|
|
3148
3068
|
|
|
3149
|
-
def _worker_loop(operations,
|
|
3069
|
+
def _worker_loop(quit_signal, operations, worker_id, op_type, key, video_backend=None):
|
|
3150
3070
|
"""
|
|
3151
3071
|
Multiprocess worker process loop.
|
|
3072
|
+
The worker process (Python layer) gets data from / sends data to the map / batch thread (C++ layer) by message queue
|
|
3073
|
+
and shared memory. This logic no longer uses the Python multi-process pool, in_queue, and out_queue for
|
|
3074
|
+
data transferring.
|
|
3152
3075
|
"""
|
|
3153
3076
|
# Initialize C++ side signal handlers
|
|
3154
3077
|
cde.register_worker_handlers()
|
|
3155
3078
|
|
|
3156
|
-
|
|
3157
|
-
|
|
3079
|
+
if video_backend is not None:
|
|
3080
|
+
set_video_backend(video_backend)
|
|
3158
3081
|
|
|
3159
3082
|
def _ignore_sigint():
|
|
3160
3083
|
"""
|
|
@@ -3168,121 +3091,197 @@ def _worker_loop(operations, pipe, worker_id):
|
|
|
3168
3091
|
if get_seed() != 5489:
|
|
3169
3092
|
set_seed(get_seed() + worker_id)
|
|
3170
3093
|
|
|
3094
|
+
msg_queue = cde.MessageQueue(key)
|
|
3095
|
+
msg_queue.set_release_flag(False)
|
|
3096
|
+
shm_queue = cde.SharedMemoryQueue(key)
|
|
3097
|
+
shm_queue.set_release_flag(False)
|
|
3098
|
+
|
|
3099
|
+
pid = str(os.getpid())
|
|
3100
|
+
ppid = str(os.getppid())
|
|
3101
|
+
|
|
3102
|
+
# Scenario: when the main process is killed, the worker process needs to release shm & msg.
|
|
3103
|
+
# The shm id and msg id should be released by SIGTERM in worker handler
|
|
3104
|
+
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3105
|
+
msg_queue.msg_queue_id)
|
|
3106
|
+
|
|
3107
|
+
num_receive = 0
|
|
3108
|
+
num_send = 0
|
|
3171
3109
|
while not _main_process_already_exit():
|
|
3172
3110
|
_ignore_sigint()
|
|
3173
3111
|
|
|
3174
|
-
|
|
3175
|
-
if
|
|
3112
|
+
# quit by close_worker
|
|
3113
|
+
if quit_signal.is_set():
|
|
3176
3114
|
return
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3115
|
+
|
|
3116
|
+
# >> receive procedure >>
|
|
3117
|
+
## 1. get message queue which contains shared memory info from map C++ thread in main process
|
|
3180
3118
|
try:
|
|
3181
|
-
|
|
3119
|
+
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3120
|
+
msg_queue.msg_queue_id)
|
|
3121
|
+
msg_queue.msg_rcv(cde.MASTER_SEND_DATA_MSG)
|
|
3122
|
+
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3123
|
+
msg_queue.msg_queue_id)
|
|
3124
|
+
except RuntimeError as err:
|
|
3125
|
+
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3126
|
+
msg_queue.msg_queue_id)
|
|
3127
|
+
# the msg_queue has been released by the main process; ignore it in the worker process
|
|
3128
|
+
if "errno: 2" in str(err):
|
|
3129
|
+
# Because the worker process does not release msg and shm, continue
|
|
3130
|
+
continue
|
|
3131
|
+
raise err
|
|
3182
3132
|
|
|
3183
|
-
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3133
|
+
## when the message queue had been released, break the loop
|
|
3134
|
+
if msg_queue.message_queue_state() == cde.MessageState.RELEASED:
|
|
3135
|
+
logger.info("The message queue had been released, worker loop end.")
|
|
3136
|
+
break
|
|
3187
3137
|
|
|
3188
|
-
|
|
3189
|
-
del pipe.in_queue
|
|
3190
|
-
del pipe.res_queue
|
|
3138
|
+
num_receive += 1
|
|
3191
3139
|
|
|
3140
|
+
logger.info("Python process {} worker({}) receives {} samples from map thread.".format(op_type, worker_id,
|
|
3141
|
+
num_receive))
|
|
3192
3142
|
|
|
3193
|
-
|
|
3194
|
-
|
|
3195
|
-
|
|
3143
|
+
# convert the data from shm to python data
|
|
3144
|
+
if op_type == cde.MAP_OP:
|
|
3145
|
+
## 2. construct shared memory to TensorRow which contains one / more columns
|
|
3146
|
+
tensor_row = shm_queue.to_tensor_row(msg_queue.shm_id, msg_queue.shm_size)
|
|
3196
3147
|
|
|
3148
|
+
## 3. convert TensorRow to a Python tuple whose elements are the columns
|
|
3149
|
+
tuple_column = cde.convert_tensor_row_to_py_tuple(tensor_row)
|
|
3197
3150
|
|
|
3198
|
-
|
|
3199
|
-
|
|
3200
|
-
|
|
3201
|
-
|
|
3202
|
-
self.worker_id = worker_id
|
|
3203
|
-
logger.info("Multiprocessing start method: {}".format(multiprocessing.get_start_method()))
|
|
3151
|
+
py_func_input = tuple_column
|
|
3152
|
+
elif op_type == cde.BATCH_OP:
|
|
3153
|
+
## 2. construct shared memory to TensorTable which contains one / more TensorRow & CBatchInfo
|
|
3154
|
+
tensor_table, batch_info, _ = shm_queue.to_tensor_table(msg_queue.shm_id, msg_queue.shm_size)
|
|
3204
3155
|
|
|
3205
|
-
|
|
3206
|
-
|
|
3156
|
+
## 3. convert TensorTable to a Python tuple of lists
|
|
3157
|
+
# The tuple indicate the multi columns
|
|
3158
|
+
# The list indicate the multi rows
|
|
3159
|
+
tuple_list_column = cde.convert_tensor_table_to_py_tuple_list(tensor_table)
|
|
3207
3160
|
|
|
3161
|
+
py_func_input = (*tuple_list_column, batch_info)
|
|
3162
|
+
else:
|
|
3163
|
+
raise RuntimeError("The op_type: {} is invalid.".format(op_type))
|
|
3208
3164
|
|
|
3209
|
-
|
|
3210
|
-
|
|
3211
|
-
|
|
3212
|
-
"""
|
|
3165
|
+
# execute the pyfunc
|
|
3166
|
+
try:
|
|
3167
|
+
py_func_output = py_func_input
|
|
3213
3168
|
|
|
3214
|
-
|
|
3215
|
-
|
|
3216
|
-
|
|
3217
|
-
|
|
3218
|
-
super().__init__(target=worker_target(operations, worker_id), name="MapWorker" + str(worker_id),
|
|
3219
|
-
args=(self.pipe,), daemon=True)
|
|
3220
|
-
|
|
3221
|
-
def execute(self, idx, *args):
|
|
3222
|
-
"""Acquiring data from a worker in an infinite loop"""
|
|
3223
|
-
self.pipe.master_send(idx, args)
|
|
3224
|
-
time_s = time.time()
|
|
3225
|
-
wait_count = 1
|
|
3226
|
-
while True:
|
|
3227
|
-
cost_time = time.time() - time_s
|
|
3228
|
-
if cost_time / self.check_interval >= wait_count:
|
|
3229
|
-
wait_count += 1
|
|
3230
|
-
logger.warning("It has been waiting for " + "%.3f" % cost_time + "s because the sub-process "
|
|
3231
|
-
"worker of the map operation is hanging. "
|
|
3232
|
-
"Check whether the user defined data transform is too slow or the "
|
|
3233
|
-
"output data is too large. You can also set the timeout interval by "
|
|
3234
|
-
"ds.config.set_multiprocessing_timeout_interval to adjust the output frequency "
|
|
3235
|
-
"of this log.")
|
|
3236
|
-
pid = self.pid
|
|
3237
|
-
logger.warning("Map worker subprocess ID {} is stuck.".format(pid))
|
|
3238
|
-
install_status, _ = subprocess.getstatusoutput("py-spy --version")
|
|
3239
|
-
if install_status == 0:
|
|
3240
|
-
stack = subprocess.getoutput("py-spy dump -p {} -l".format(pid))
|
|
3241
|
-
logger.warning("Map worker subprocess stack:\n{}".format(stack))
|
|
3169
|
+
# execute the remaining operations
|
|
3170
|
+
for idx in range(len(operations)):
|
|
3171
|
+
if isinstance(py_func_output, tuple):
|
|
3172
|
+
py_func_output = operations[idx](*py_func_output)
|
|
3242
3173
|
else:
|
|
3243
|
-
|
|
3174
|
+
py_func_output = operations[idx](py_func_output)
|
|
3175
|
+
|
|
3176
|
+
# << send procedure <<
|
|
3177
|
+
# the result is None
|
|
3178
|
+
if py_func_output is None:
|
|
3179
|
+
raise RuntimeError("Got None from Python Function which is defined by {}".format(op_type))
|
|
3180
|
+
|
|
3181
|
+
# convert the output to tuple
|
|
3182
|
+
if not isinstance(py_func_output, tuple):
|
|
3183
|
+
py_func_output = (py_func_output,)
|
|
3184
|
+
|
|
3185
|
+
if op_type == cde.MAP_OP:
|
|
3186
|
+
# check if the map return Generator type
|
|
3187
|
+
for item in py_func_output:
|
|
3188
|
+
if isinstance(item, GeneratorType):
|
|
3189
|
+
raise RuntimeError("Cannot pickle <class 'generator'> object, please verify pyfunc "
|
|
3190
|
+
"return with numpy array")
|
|
3191
|
+
|
|
3192
|
+
## 1. convert Python tuple to TensorRow
|
|
3193
|
+
output_tensor_row = cde.convert_py_tuple_to_tensor_row(py_func_output)
|
|
3194
|
+
|
|
3195
|
+
## 2. convert TensorRow to shared memory
|
|
3196
|
+
shm_queue.from_tensor_row(output_tensor_row)
|
|
3197
|
+
elif op_type == cde.BATCH_OP:
|
|
3198
|
+
## 1. convert Python tuple of lists to TensorTable
|
|
3199
|
+
output_tensor_table, concat_batch = cde.convert_py_tuple_list_to_tensor_table(py_func_output)
|
|
3200
|
+
|
|
3201
|
+
## 2. convert TensorTable to shared memory
|
|
3202
|
+
shm_queue.from_tensor_table(output_tensor_table, batch_info, concat_batch)
|
|
3203
|
+
else:
|
|
3204
|
+
raise RuntimeError("The op_type: {} is invalid.".format(op_type))
|
|
3205
|
+
|
|
3206
|
+
## 3. send message queue which contains shared memory to map C++ thread in main process
|
|
3207
|
+
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3208
|
+
msg_queue.msg_queue_id)
|
|
3209
|
+
msg_queue.msg_snd(cde.WORKER_SEND_DATA_MSG, shm_queue.get_shm_id(), shm_queue.get_shm_size())
|
|
3210
|
+
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3211
|
+
msg_queue.msg_queue_id)
|
|
3212
|
+
|
|
3213
|
+
num_send += 1
|
|
3214
|
+
logger.info("Python process {} worker({}) sends {} samples to map thread.".format(op_type, worker_id,
|
|
3215
|
+
num_send))
|
|
3216
|
+
except Exception:
|
|
3244
3217
|
try:
|
|
3245
|
-
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3249
|
-
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
|
|
3253
|
-
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3271
|
-
|
|
3272
|
-
|
|
3273
|
-
|
|
3274
|
-
|
|
3275
|
-
|
|
3276
|
-
|
|
3277
|
-
|
|
3218
|
+
if op_type == cde.MAP_OP:
|
|
3219
|
+
pyfunc_err = ExceptionHandler(where="in map worker and execute Python function")
|
|
3220
|
+
elif op_type == cde.BATCH_OP:
|
|
3221
|
+
pyfunc_err = ExceptionHandler(where="in batch(per_batch_map) worker and execute Python function")
|
|
3222
|
+
else:
|
|
3223
|
+
pyfunc_err = "The op_type: {} is invalid.".format(op_type)
|
|
3224
|
+
pyfunc_err.reraise()
|
|
3225
|
+
except Exception as err:
|
|
3226
|
+
_, _, exc_tb = sys.exc_info()
|
|
3227
|
+
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
|
3228
|
+
|
|
3229
|
+
if op_type == cde.MAP_OP:
|
|
3230
|
+
logger.info("Got exception {} from Map Worker({})".format(str(err), worker_id))
|
|
3231
|
+
elif op_type == cde.BATCH_OP:
|
|
3232
|
+
logger.info("Got exception {} from Batch Worker({})".format(str(err), worker_id))
|
|
3233
|
+
else:
|
|
3234
|
+
logger.info("The op_type: {} is invalid.".format(op_type))
|
|
3235
|
+
|
|
3236
|
+
# err_code, lineno, filename, err_desc
|
|
3237
|
+
msg_queue.serialize_status(cde.StatusCode.MD_PY_FUNC_EXCEPTION, exc_tb.tb_lineno, fname, str(err))
|
|
3238
|
+
|
|
3239
|
+
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3240
|
+
msg_queue.msg_queue_id)
|
|
3241
|
+
msg_queue.msg_snd(cde.WORKER_SEND_DATA_MSG, shm_queue.get_shm_id(), shm_queue.get_shm_size())
|
|
3242
|
+
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3243
|
+
msg_queue.msg_queue_id)
|
|
3244
|
+
|
|
3245
|
+
# worker error
|
|
3246
|
+
if get_error_samples_mode() == ErrorSamplesMode.RETURN:
|
|
3247
|
+
break
|
|
3248
|
+
else:
|
|
3249
|
+
# continue the loop, when the get_error_samples_mode() is REPLACE or SKIP
|
|
3250
|
+
continue
|
|
3251
|
+
|
|
3252
|
+
# release the eager executor which is used by current process
|
|
3253
|
+
transforms.transforms.clean_unused_executors()
|
|
3254
|
+
|
|
3255
|
+
while not _main_process_already_exit():
|
|
3256
|
+
# quit by close_worker
|
|
3257
|
+
if quit_signal.is_set():
|
|
3278
3258
|
return
|
|
3279
|
-
return
|
|
3280
3259
|
|
|
3281
|
-
|
|
3282
|
-
|
|
3283
|
-
|
|
3284
|
-
|
|
3285
|
-
|
|
3260
|
+
logger.info("The worker process is waiting for the main process to exit.")
|
|
3261
|
+
time.sleep(0.1)
|
|
3262
|
+
|
|
3263
|
+
# the main process no longer exists; it may have been killed with kill -9
|
|
3264
|
+
msg_queue.set_release_flag(True)
|
|
3265
|
+
msg_queue.release()
|
|
3266
|
+
shm_queue.set_release_flag(True)
|
|
3267
|
+
shm_queue.release()
|
|
3268
|
+
|
|
3269
|
+
|
|
3270
|
+
class WorkerTarget:
|
|
3271
|
+
"""Multiprocess mode for dataset map or batch"""
|
|
3272
|
+
def __init__(self, quit_signal, operations, worker_id, op_type, ftok_key):
|
|
3273
|
+
self.quit_signal = quit_signal
|
|
3274
|
+
self.operations = operations
|
|
3275
|
+
self.worker_id = worker_id
|
|
3276
|
+
self.op_type = op_type
|
|
3277
|
+
self.ftok_key = ftok_key
|
|
3278
|
+
start_method = multiprocessing.get_start_method()
|
|
3279
|
+
logger.info("Multiprocessing start method: {}".format(start_method))
|
|
3280
|
+
self.video_backend = get_video_backend() if start_method == 'spawn' else None
|
|
3281
|
+
|
|
3282
|
+
def __call__(self):
|
|
3283
|
+
return _worker_loop(self.quit_signal, self.operations, self.worker_id, self.op_type, self.ftok_key,
|
|
3284
|
+
self.video_backend)
|
|
3286
3285
|
|
|
3287
3286
|
|
|
3288
3287
|
def worker_is_alive(worker):
|
|
@@ -3293,24 +3292,31 @@ def worker_is_alive(worker):
|
|
|
3293
3292
|
return False
|
|
3294
3293
|
|
|
3295
3294
|
|
|
3296
|
-
def close_worker(worker,
|
|
3295
|
+
def close_worker(worker, eof):
|
|
3297
3296
|
"""Close the subprocess worker in spawn mode"""
|
|
3298
3297
|
try:
|
|
3299
3298
|
if worker_is_alive(worker):
|
|
3300
3299
|
# release the eager executor which is used by current process
|
|
3301
3300
|
transforms.transforms.clean_unused_executors()
|
|
3302
3301
|
|
|
3303
|
-
|
|
3304
|
-
|
|
3302
|
+
# let the worker exit
|
|
3303
|
+
logger.info("Set eof flag for worker with PID: {}.".format(worker.pid))
|
|
3304
|
+
eof.set()
|
|
3305
|
+
|
|
3306
|
+
# wait timeout
|
|
3307
|
+
wait_timeout = 2
|
|
3308
|
+
start_time = time.time()
|
|
3305
3309
|
|
|
3306
3310
|
process_dir = os.path.join('/proc', str(worker.pid))
|
|
3307
3311
|
while worker_is_alive(worker) and os.path.exists(process_dir):
|
|
3308
3312
|
logger.info("Waiting for worker {} closed ...".format(worker.pid))
|
|
3309
3313
|
time.sleep(0.5)
|
|
3310
3314
|
|
|
3315
|
+
# the worker may be blocked in msg_queue.MsgRcv, so break out of the loop and terminate it in the next step
|
|
3316
|
+
if time.time() - start_time > wait_timeout:
|
|
3317
|
+
break
|
|
3318
|
+
|
|
3311
3319
|
# del the handle which hold by master
|
|
3312
|
-
del pipe.in_queue
|
|
3313
|
-
del pipe.res_queue
|
|
3314
3320
|
worker.terminate()
|
|
3315
3321
|
worker.join()
|
|
3316
3322
|
worker.close()
|
|
@@ -3367,7 +3373,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3367
3373
|
self.warning_ctl = None
|
|
3368
3374
|
# cache thread (get_ident()) to worker_id mapping in Python layer
|
|
3369
3375
|
self.python_threads_to_workers = {}
|
|
3370
|
-
self.
|
|
3376
|
+
self.eof_workers = []
|
|
3377
|
+
self.eof_clean_process = None
|
|
3371
3378
|
self.running = False
|
|
3372
3379
|
|
|
3373
3380
|
def __del__(self):
|
|
@@ -3443,19 +3450,39 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3443
3450
|
del workers
|
|
3444
3451
|
os.kill(os.getpid(), signal.SIGTERM)
|
|
3445
3452
|
|
|
3446
|
-
def launch(self, op_id
|
|
3453
|
+
def launch(self, op_id, op_type, ftok_keys):
|
|
3447
3454
|
"""
|
|
3448
3455
|
Launch Python multiprocessing pool.
|
|
3449
3456
|
|
|
3450
3457
|
Args:
|
|
3451
|
-
op_id: ID for operation to have Python multiprocessing pool launched
|
|
3458
|
+
op_id (int): ID for operation to have Python multiprocessing pool launched
|
|
3459
|
+
op_type (str): Indicate MapOp / BatchOp
|
|
3460
|
+
ftok_keys (list[int]): list of ftok keys for the msg queue and shm queue
|
|
3452
3461
|
|
|
3453
3462
|
Returns:
|
|
3454
3463
|
Python multiprocessing pool is launched.
|
|
3455
3464
|
"""
|
|
3456
3465
|
self.python_threads_to_workers = {}
|
|
3466
|
+
|
|
3467
|
+
if not isinstance(op_id, int):
|
|
3468
|
+
raise RuntimeError("The op_id is not int.")
|
|
3457
3469
|
self.op_id = op_id
|
|
3458
|
-
|
|
3470
|
+
|
|
3471
|
+
valid_op_type = [cde.MAP_OP, cde.BATCH_OP]
|
|
3472
|
+
if op_type not in valid_op_type:
|
|
3473
|
+
raise RuntimeError("The op_type: {} is not in {}.".format(op_type, valid_op_type))
|
|
3474
|
+
self.op_type = op_type
|
|
3475
|
+
|
|
3476
|
+
if not isinstance(ftok_keys, list):
|
|
3477
|
+
raise RuntimeError("The ftok_keys is not a list.")
|
|
3478
|
+
if not all(isinstance(x, int) for x in ftok_keys):
|
|
3479
|
+
raise RuntimeError("The item in ftok_keys is not all int.")
|
|
3480
|
+
if len(ftok_keys) != self.num_parallel_workers:
|
|
3481
|
+
raise RuntimeError("The len of ftok_keys is not equal to num_parallel_workers.")
|
|
3482
|
+
self.ftok_keys = ftok_keys
|
|
3483
|
+
|
|
3484
|
+
logger.info("Launching new Python multiprocessing pool for Op: " + self.op_type + "(" + str(self.op_id) + \
|
|
3485
|
+
"), ftok_keys: " + str(self.ftok_keys))
|
|
3459
3486
|
if self.is_mp_enabled():
|
|
3460
3487
|
message = "Launching a new Python multiprocessing pool while a pool already exists!" + \
|
|
3461
3488
|
" The existing pool will be terminated first."
|
|
@@ -3478,30 +3505,21 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3478
3505
|
raise Exception("Pool was already created, close it first.")
|
|
3479
3506
|
|
|
3480
3507
|
self.workers = []
|
|
3481
|
-
self.pipes = []
|
|
3482
|
-
self.check_interval = get_multiprocessing_timeout_interval()
|
|
3483
3508
|
self.warning_ctl = multiprocessing.Value('i', 0)
|
|
3484
|
-
if self.start_method == "fork":
|
|
3485
|
-
# Construct python worker processes
|
|
3486
|
-
for worker_id in range(self.num_parallel_workers):
|
|
3487
|
-
worker = _MPWorker(self.operations, self.warning_ctl, self.max_rowsize, worker_id)
|
|
3488
|
-
worker.start()
|
|
3489
|
-
self.workers.append(worker)
|
|
3490
|
-
else:
|
|
3491
|
-
multiprocessing.set_start_method(self.start_method, True)
|
|
3492
3509
|
|
|
3493
|
-
|
|
3494
|
-
|
|
3495
|
-
|
|
3496
|
-
|
|
3497
|
-
|
|
3498
|
-
|
|
3499
|
-
|
|
3500
|
-
|
|
3501
|
-
|
|
3502
|
-
|
|
3510
|
+
multiprocessing.set_start_method(self.start_method, True)
|
|
3511
|
+
|
|
3512
|
+
# Construct python worker processes
|
|
3513
|
+
for worker_id in range(self.num_parallel_workers):
|
|
3514
|
+
eof = multiprocessing.Event()
|
|
3515
|
+
worker = multiprocessing.Process(target=WorkerTarget(eof, self.operations, worker_id, self.op_type,
|
|
3516
|
+
self.ftok_keys[worker_id]),
|
|
3517
|
+
name="MapWorker" + str(worker_id), daemon=True)
|
|
3518
|
+
self.eof_workers.append(eof)
|
|
3519
|
+
self.workers.append(worker)
|
|
3520
|
+
worker.start()
|
|
3503
3521
|
|
|
3504
|
-
|
|
3522
|
+
multiprocessing.set_start_method("fork", True)
|
|
3505
3523
|
|
|
3506
3524
|
logger.info("Launch worker process(es): {}".format(self.get_pids()))
|
|
3507
3525
|
|
|
@@ -3515,6 +3533,20 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3515
3533
|
atexit.register(lambda cleanup: cleanup()() if cleanup() is not None else None,
|
|
3516
3534
|
weakref.WeakMethod(self.terminate))
|
|
3517
3535
|
|
|
3536
|
+
# Ensure that all workers are in the running state
|
|
3537
|
+
start = time.time()
|
|
3538
|
+
wait_time = 120 # 120s
|
|
3539
|
+
while True:
|
|
3540
|
+
if self.is_running():
|
|
3541
|
+
logger.info("All workers has been running state.")
|
|
3542
|
+
break
|
|
3543
|
+
else:
|
|
3544
|
+
time.sleep(0.5)
|
|
3545
|
+
if time.time() - start > wait_time:
|
|
3546
|
+
logger.error("All worker processes have not reached the running state within " + str(wait_time) +
|
|
3547
|
+
" seconds, data processing errors may occur.")
|
|
3548
|
+
break
|
|
3549
|
+
|
|
3518
3550
|
def terminate(self):
|
|
3519
3551
|
if self.running:
|
|
3520
3552
|
# abort the monitor first and then close all the workers
|
|
@@ -3543,7 +3575,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3543
3575
|
continue
|
|
3544
3576
|
return self.pids
|
|
3545
3577
|
|
|
3546
|
-
def add_new_workers(self, num_new_workers):
|
|
3578
|
+
def add_new_workers(self, num_new_workers, op_type, ftok_keys):
|
|
3579
|
+
"""Used by AutoTune"""
|
|
3547
3580
|
logger.info(
|
|
3548
3581
|
"Increasing num_parallel_workers of Python Multiprocessing pool for Op:" + str(self.op_id) +
|
|
3549
3582
|
", old num_workers=" + str(self.num_parallel_workers) + " new num_workers=" + str(
|
|
@@ -3551,9 +3584,14 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3551
3584
|
num_new_workers) + ".")
|
|
3552
3585
|
self.terminate()
|
|
3553
3586
|
self.num_parallel_workers += num_new_workers
|
|
3554
|
-
self.launch(self.op_id)
|
|
3555
3587
|
|
|
3556
|
-
|
|
3588
|
+
if self.num_parallel_workers != len(ftok_keys):
|
|
3589
|
+
raise RuntimeError("Add new workers failed, the num_workers is not equal size of ftok_keys.")
|
|
3590
|
+
|
|
3591
|
+
self.launch(self.op_id, op_type, ftok_keys)
|
|
3592
|
+
|
|
3593
|
+
def remove_workers(self, num_removed_workers, op_type, ftok_keys):
|
|
3594
|
+
"""Used by AutoTune"""
|
|
3557
3595
|
logger.info(
|
|
3558
3596
|
"Decreasing num_parallel_workers of Python Multiprocessing pool for Op:" + str(self.op_id) +
|
|
3559
3597
|
", old num_workers=" + str(self.num_parallel_workers) + " new num_workers=" + str(
|
|
@@ -3561,59 +3599,14 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3561
3599
|
num_removed_workers) + ".")
|
|
3562
3600
|
self.terminate()
|
|
3563
3601
|
self.num_parallel_workers -= num_removed_workers
|
|
3564
|
-
self.launch(self.op_id)
|
|
3565
3602
|
|
|
3566
|
-
|
|
3567
|
-
|
|
3603
|
+
if self.num_parallel_workers != len(ftok_keys):
|
|
3604
|
+
raise RuntimeError("Remove workers failed, the num_workers is not equal size of ftok_keys.")
|
|
3568
3605
|
|
|
3569
|
-
|
|
3570
|
-
"""
|
|
3571
|
-
Execute
|
|
3572
|
-
"""
|
|
3573
|
-
t_id = threading.get_ident()
|
|
3574
|
-
# get the worker_id from Python layer cache first, get from Cpp layer if not found.
|
|
3575
|
-
worker_id = self.python_threads_to_workers.setdefault(t_id, self.get_thread_to_worker())
|
|
3576
|
-
if worker_id >= len(self.workers):
|
|
3577
|
-
raise RuntimeError("[Internal] worker_id value is greater than number of available workers!")
|
|
3578
|
-
|
|
3579
|
-
# todo check_iterator_cleanup
|
|
3580
|
-
if self.is_running() and check_iterator_cleanup() is False:
|
|
3581
|
-
if self.start_method == "fork":
|
|
3582
|
-
return self.workers[worker_id].execute(idx, *args)
|
|
3583
|
-
# spawn mode
|
|
3584
|
-
self.pipes[worker_id].master_send(idx, args)
|
|
3585
|
-
time_s = time.time()
|
|
3586
|
-
wait_count = 1
|
|
3587
|
-
while True:
|
|
3588
|
-
cost_time = time.time() - time_s
|
|
3589
|
-
if cost_time / self.check_interval >= wait_count:
|
|
3590
|
-
wait_count += 1
|
|
3591
|
-
logger.warning("It has been waiting for " + "%.3f" % cost_time + "s because the sub-process "
|
|
3592
|
-
"worker of the map operation is hanging. "
|
|
3593
|
-
"Check whether the user defined data transform is too slow or the "
|
|
3594
|
-
"output data is too large. You can also set the timeout interval by "
|
|
3595
|
-
"ds.config.set_multiprocessing_timeout_interval to adjust the output frequency "
|
|
3596
|
-
"of this log.")
|
|
3597
|
-
pid = self.workers[worker_id].pid
|
|
3598
|
-
logger.warning("Map worker subprocess ID {} is stuck.".format(pid))
|
|
3599
|
-
install_status, _ = subprocess.getstatusoutput("py-spy --version")
|
|
3600
|
-
if install_status == 0:
|
|
3601
|
-
stack = subprocess.getoutput("py-spy dump -p {} -l".format(pid))
|
|
3602
|
-
logger.warning("Map worker subprocess stack:\n{}".format(stack))
|
|
3603
|
-
else:
|
|
3604
|
-
logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
|
|
3605
|
-
try:
|
|
3606
|
-
res = self.pipes[worker_id].master_receive()
|
|
3607
|
-
except queue.Empty:
|
|
3608
|
-
continue
|
|
3609
|
-
if res is None:
|
|
3610
|
-
# receive finish signal
|
|
3611
|
-
return None
|
|
3612
|
-
if isinstance(res, ExceptionHandler):
|
|
3613
|
-
res.reraise()
|
|
3614
|
-
return res
|
|
3606
|
+
self.launch(self.op_id, op_type, ftok_keys)
|
|
3615
3607
|
|
|
3616
|
-
|
|
3608
|
+
def is_mp_enabled(self):
|
|
3609
|
+
return self.workers is not None
|
|
3617
3610
|
|
|
3618
3611
|
def _launch_monitor(self):
|
|
3619
3612
|
"""
|
|
@@ -3622,26 +3615,28 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3622
3615
|
The watch dog will clean up subprocesses and main process when any subprocess exited.
|
|
3623
3616
|
"""
|
|
3624
3617
|
if platform.system().lower() != 'windows':
|
|
3625
|
-
self.
|
|
3618
|
+
self.eof_clean_process = multiprocessing.Event()
|
|
3626
3619
|
self.cleaning_process = multiprocessing.Process(target=self._clean_process,
|
|
3627
3620
|
name="MapCleanProcess",
|
|
3628
|
-
args=(self.ppid, self.workers, self.
|
|
3621
|
+
args=(self.ppid, self.workers, self.eof_clean_process),
|
|
3629
3622
|
daemon=True)
|
|
3630
3623
|
self.cleaning_process.start()
|
|
3631
3624
|
logger.info("Launch clean process {} to monitor worker "
|
|
3632
3625
|
"process(es): {}".format(self.cleaning_process.pid, self.get_pids()))
|
|
3633
3626
|
|
|
3634
3627
|
if get_enable_watchdog():
|
|
3635
|
-
worker_ids = [
|
|
3628
|
+
worker_ids = [os.getpid()]
|
|
3629
|
+
worker_ids.extend([worker.pid for worker in self.workers])
|
|
3636
3630
|
worker_ids.append(self.cleaning_process.pid)
|
|
3637
|
-
cde.register_worker_pids(id(self),
|
|
3631
|
+
cde.register_worker_pids(id(self), worker_ids)
|
|
3638
3632
|
|
|
3639
3633
|
def _abort_monitor(self):
|
|
3640
3634
|
"""Deregister workers monitored by the watch dog and join clean process."""
|
|
3641
3635
|
if get_enable_watchdog():
|
|
3642
3636
|
cde.deregister_worker_pids(id(self))
|
|
3643
|
-
if hasattr(self, 'eof') and self.
|
|
3644
|
-
|
|
3637
|
+
if hasattr(self, 'eof') and self.eof_clean_process is not None:
|
|
3638
|
+
logger.info("Set eof flag for cleaning_process.")
|
|
3639
|
+
self.eof_clean_process.set()
|
|
3645
3640
|
if hasattr(self, 'cleaning_process') and self.cleaning_process is not None:
|
|
3646
3641
|
# let the quit event notify the cleaning process to exit
|
|
3647
3642
|
self.cleaning_process.join(timeout=5)
|
|
@@ -3652,20 +3647,14 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3652
3647
|
|
|
3653
3648
|
def is_running(self):
|
|
3654
3649
|
if hasattr(self, 'workers') and self.workers is not None:
|
|
3655
|
-
if self.start_method == "fork":
|
|
3656
|
-
return all([w.is_alive() for w in self.workers])
|
|
3657
3650
|
return all([worker_is_alive(w) for w in self.workers])
|
|
3658
3651
|
return False
|
|
3659
3652
|
|
|
3660
3653
|
def close_all_workers(self):
|
|
3661
3654
|
"""Close all the subprocess workers"""
|
|
3662
3655
|
if hasattr(self, 'workers') and self.workers is not None:
|
|
3663
|
-
|
|
3664
|
-
|
|
3665
|
-
w.close()
|
|
3666
|
-
else:
|
|
3667
|
-
for i, w in enumerate(self.workers):
|
|
3668
|
-
close_worker(w, self.pipes[i])
|
|
3656
|
+
for index in range(len(self.workers)):
|
|
3657
|
+
close_worker(self.workers[index], self.eof_workers[index])
|
|
3669
3658
|
|
|
3670
3659
|
check_interval = get_multiprocessing_timeout_interval()
|
|
3671
3660
|
for w in self.workers:
|
|
@@ -3682,12 +3671,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3682
3671
|
continue
|
|
3683
3672
|
raise e
|
|
3684
3673
|
try:
|
|
3685
|
-
if
|
|
3686
|
-
|
|
3687
|
-
os.close(subprocess_file_descriptor)
|
|
3688
|
-
else:
|
|
3689
|
-
if worker_is_alive(w):
|
|
3690
|
-
os.close(subprocess_file_descriptor)
|
|
3674
|
+
if worker_is_alive(w):
|
|
3675
|
+
os.close(subprocess_file_descriptor)
|
|
3691
3676
|
except OSError as e:
|
|
3692
3677
|
# Maybe the file descriptor had been released, so ignore the 'Bad file descriptor'
|
|
3693
3678
|
if "Bad file descriptor" not in str(e):
|
|
@@ -3696,8 +3681,12 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3696
3681
|
# use clear to release the handle which is better than self.workers = None
|
|
3697
3682
|
self.workers.clear()
|
|
3698
3683
|
self.workers = None
|
|
3699
|
-
self.
|
|
3700
|
-
self.
|
|
3684
|
+
self.eof_workers.clear()
|
|
3685
|
+
self.eof_workers = []
|
|
3686
|
+
|
|
3687
|
+
# a PyFunc that executes very slowly can prevent the main process from exiting, so release
|
|
3688
|
+
# the shm & msg here
|
|
3689
|
+
cde.release_shm_and_msg_by_worker_pids(self.pids)
|
|
3701
3690
|
self.pids = None
|
|
3702
3691
|
|
|
3703
3692
|
|
|
@@ -3775,7 +3764,22 @@ class MapDataset(UnionBaseDataset):
|
|
|
3775
3764
|
|
|
3776
3765
|
count_old_transforms, count_new_transforms, count_non_data_vision_transforms = \
|
|
3777
3766
|
self.__count_transforms(operations)
|
|
3767
|
+
count_py_ops = self.__count_py_ops(operations)
|
|
3778
3768
|
count_pyfunc = self.__count_pyfuncs(operations)
|
|
3769
|
+
|
|
3770
|
+
# Whether to execute ops in the thread mode
|
|
3771
|
+
# op_type python_multiprocessing run_in_thread
|
|
3772
|
+
# c_op(s) false yes
|
|
3773
|
+
# c_op(s) true yes
|
|
3774
|
+
# py_op(s) / PyFunc false yes
|
|
3775
|
+
# py_op(s) / PyFunc true no
|
|
3776
|
+
# c_op(s) + py_op(s) / PyFunc false yes
|
|
3777
|
+
# c_op(s) + py_op(s) / PyFunc true no
|
|
3778
|
+
run_in_thread = not self.python_multiprocessing or (count_pyfunc == 0 and count_py_ops == 0) or get_debug_mode()
|
|
3779
|
+
|
|
3780
|
+
if self.python_multiprocessing and platform.system().lower() == 'windows':
|
|
3781
|
+
run_in_thread = True
|
|
3782
|
+
|
|
3779
3783
|
if count_new_transforms + count_pyfunc == len(operations):
|
|
3780
3784
|
prev_op = None
|
|
3781
3785
|
for op in operations:
|
|
@@ -3793,18 +3797,43 @@ class MapDataset(UnionBaseDataset):
|
|
|
3793
3797
|
op.implementation = Implementation.C
|
|
3794
3798
|
prev_op = op
|
|
3795
3799
|
operations = self.__insert_debug_wrapper(operations)
|
|
3796
|
-
|
|
3800
|
+
if run_in_thread:
|
|
3801
|
+
operations = transforms.transforms.Compose.reduce(operations)
|
|
3797
3802
|
elif count_old_transforms + count_pyfunc + count_non_data_vision_transforms == len(operations):
|
|
3798
3803
|
operations = self.__insert_debug_wrapper(operations)
|
|
3799
|
-
|
|
3804
|
+
if run_in_thread:
|
|
3805
|
+
operations = transforms.py_transforms.Compose.reduce(operations)
|
|
3800
3806
|
else:
|
|
3801
3807
|
raise RuntimeError("Mixing old legacy c/py_transforms and new unified transforms is not allowed.")
|
|
3802
3808
|
|
|
3803
|
-
|
|
3809
|
+
if run_in_thread:
|
|
3810
|
+
self.operations = self.__process_final_operations(operations)
|
|
3811
|
+
else:
|
|
3812
|
+
self.operations = operations
|
|
3804
3813
|
self.prepare_multiprocessing()
|
|
3805
3814
|
|
|
3806
3815
|
callbacks = [cb.create_runtime_obj() for cb in self.callbacks]
|
|
3807
|
-
|
|
3816
|
+
|
|
3817
|
+
## thread mode
|
|
3818
|
+
if run_in_thread:
|
|
3819
|
+
return cde.MapNode(children[0], self.operations, self.input_columns, self.output_columns,
|
|
3820
|
+
callbacks, OffloadToManualOffloadMode.get(self.offload), self.process_pool)
|
|
3821
|
+
|
|
3822
|
+
# Bind self.operations with self.process_pool
|
|
3823
|
+
class _BindProcessPoolWithOperations:
|
|
3824
|
+
def __init__(self, pool, operations):
|
|
3825
|
+
self.pool = pool
|
|
3826
|
+
self.operations = operations
|
|
3827
|
+
|
|
3828
|
+
def __call__(self):
|
|
3829
|
+
pass
|
|
3830
|
+
|
|
3831
|
+
self.bound = _BindProcessPoolWithOperations(self.process_pool, self.operations)
|
|
3832
|
+
|
|
3833
|
+
## process mode
|
|
3834
|
+
# in multi process mode, we just transfer the self.bound which is not really used in c layer
|
|
3835
|
+
# because when the pipeline is running, map thread transfer data through c++ shm & msg to Python Worker Process
|
|
3836
|
+
return cde.MapNode(children[0], [self.bound], self.input_columns, self.output_columns,
|
|
3808
3837
|
callbacks, OffloadToManualOffloadMode.get(self.offload), self.process_pool)
|
|
3809
3838
|
|
|
3810
3839
|
def __deepcopy__(self, memodict):
|
|
@@ -3857,10 +3886,22 @@ class MapDataset(UnionBaseDataset):
|
|
|
3857
3886
|
@staticmethod
|
|
3858
3887
|
def __count_pyfuncs(operations):
|
|
3859
3888
|
"""
|
|
3860
|
-
Count the number of pyfuncs operations
|
|
3889
|
+
Count the number of pyfuncs operations which is defined by user
|
|
3861
3890
|
"""
|
|
3862
3891
|
return sum([1 if isinstance(op, FuncWrapper) else 0 for op in operations])
|
|
3863
3892
|
|
|
3893
|
+
@staticmethod
|
|
3894
|
+
def __count_py_ops(operations):
|
|
3895
|
+
"""
|
|
3896
|
+
Count the number of python operations which is built-in
|
|
3897
|
+
"""
|
|
3898
|
+
count = 0
|
|
3899
|
+
for op in operations:
|
|
3900
|
+
if hasattr(op, "implementation") and op.implementation != Implementation.C \
|
|
3901
|
+
and op.implementation is not None:
|
|
3902
|
+
count += 1
|
|
3903
|
+
return count
|
|
3904
|
+
|
|
3864
3905
|
@staticmethod
|
|
3865
3906
|
def __count_transforms(operations):
|
|
3866
3907
|
"""
|
|
@@ -3924,7 +3965,6 @@ class MapDataset(UnionBaseDataset):
|
|
|
3924
3965
|
" Ignoring Python multiprocessing for map operation.")
|
|
3925
3966
|
return
|
|
3926
3967
|
if self.python_multiprocessing:
|
|
3927
|
-
iter_specific_operations = []
|
|
3928
3968
|
callable_list = []
|
|
3929
3969
|
|
|
3930
3970
|
# If user didn't specify num_parallel_workers, set it to default
|
|
@@ -3941,18 +3981,6 @@ class MapDataset(UnionBaseDataset):
|
|
|
3941
3981
|
self.process_pool = _PythonMultiprocessing(get_multiprocessing_start_method(),
|
|
3942
3982
|
self.num_parallel_workers, str(self),
|
|
3943
3983
|
callable_list, self.max_rowsize)
|
|
3944
|
-
# Pass #2
|
|
3945
|
-
idx = 0
|
|
3946
|
-
for op in self.operations:
|
|
3947
|
-
# our c transforms is now callable and should not be run in Python multithreading
|
|
3948
|
-
if MapDataset.__operation_valid_for_multiprocessing(op):
|
|
3949
|
-
# Wrap Python callable into _PythonCallable
|
|
3950
|
-
iter_specific_operations.append(_PythonCallable(op, idx, self.process_pool))
|
|
3951
|
-
idx += 1
|
|
3952
|
-
else:
|
|
3953
|
-
# CPP ops remain the same
|
|
3954
|
-
iter_specific_operations.append(op)
|
|
3955
|
-
self.operations = iter_specific_operations
|
|
3956
3984
|
|
|
3957
3985
|
def __insert_debug_wrapper(self, operations):
|
|
3958
3986
|
"""
|
|
@@ -4385,7 +4413,7 @@ class TransferDataset(Dataset):
|
|
|
4385
4413
|
def create_dict_iterator(self, num_epochs=-1, output_numpy=False):
|
|
4386
4414
|
raise RuntimeError("TransferDataset is not iterable.")
|
|
4387
4415
|
|
|
4388
|
-
def create_tuple_iterator(self, columns=None, num_epochs=-1, output_numpy=False, do_copy=
|
|
4416
|
+
def create_tuple_iterator(self, columns=None, num_epochs=-1, output_numpy=False, do_copy=False):
|
|
4389
4417
|
raise RuntimeError("TransferDataset is not iterable.")
|
|
4390
4418
|
|
|
4391
4419
|
def __iter__(self):
|