mindspore 1.10.0__cp37-cp37m-win_amd64.whl → 2.0.0rc1__cp37-cp37m-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore might be problematic.
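The listing below covers every file that was added, removed, renamed, or modified between the two wheels, with per-file line counts. The same kind of content comparison can be reproduced locally from the published artifacts; here is a minimal sketch using only the Python standard library (the two wheel filenames are the usual PyPI artifact names, assumed rather than taken from this page):

```python
# Hypothetical local reproduction of a wheel-content comparison; the two
# filenames below are assumed PyPI artifact names, not part of this diff page.
import zipfile

def wheel_members(path):
    """Return {archive member name: uncompressed size} for one wheel."""
    with zipfile.ZipFile(path) as wheel:
        return {info.filename: info.file_size for info in wheel.infolist()}

old = wheel_members("mindspore-1.10.0-cp37-cp37m-win_amd64.whl")
new = wheel_members("mindspore-2.0.0rc1-cp37-cp37m-win_amd64.whl")

added = sorted(set(new) - set(old))
removed = sorted(set(old) - set(new))
changed = sorted(name for name in set(old) & set(new) if old[name] != new[name])
print(f"{len(added)} added, {len(removed)} removed, {len(changed)} changed in size")
```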
- mindspore/.commit_id +1 -1
- mindspore/ConcurrencyCheck.dll +0 -0
- mindspore/CppBuildInsights.dll +0 -0
- mindspore/CppCoreCheck.dll +0 -0
- mindspore/EnumIndex.dll +0 -0
- mindspore/EspXEngine.dll +0 -0
- mindspore/HResultCheck.dll +0 -0
- mindspore/KernelTraceControl.dll +0 -0
- mindspore/LocalESPC.dll +0 -0
- mindspore/Microsoft.Diagnostics.Tracing.EventSource.dll +0 -0
- mindspore/Microsoft.VisualStudio.RemoteControl.dll +0 -0
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Microsoft.VisualStudio.Utilities.Internal.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/System.Runtime.CompilerServices.Unsafe.dll +0 -0
- mindspore/VariantClear.dll +0 -0
- mindspore/__init__.py +9 -4
- mindspore/_c_dataengine.cp37-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp37-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp37-win_amd64.pyd +0 -0
- mindspore/_check_jit_forbidden_api.py +102 -0
- mindspore/_checkparam.py +1066 -1001
- mindspore/_extends/builtin_operations.py +32 -4
- mindspore/_extends/graph_kernel/model/graph_split.py +66 -222
- mindspore/_extends/parallel_compile/akg_compiler/akg_process.py +12 -9
- mindspore/_extends/parallel_compile/akg_compiler/build_tbe_kernel.py +119 -26
- mindspore/_extends/parallel_compile/akg_compiler/tbe_topi.py +50 -50
- mindspore/_extends/parallel_compile/akg_compiler/util.py +9 -6
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_adapter.py +4 -25
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +9 -4
- mindspore/_extends/parallel_compile/tbe_compiler/tbe_job_manager.py +1 -27
- mindspore/_extends/parse/__init__.py +5 -3
- mindspore/_extends/parse/namespace.py +17 -2
- mindspore/_extends/parse/parser.py +193 -34
- mindspore/_extends/parse/resources.py +7 -8
- mindspore/_extends/parse/standard_method.py +1780 -435
- mindspore/_extends/parse/trope.py +3 -1
- mindspore/amp.py +53 -58
- mindspore/atlprov.dll +0 -0
- mindspore/boost/adasum.py +3 -2
- mindspore/boost/boost.py +2 -2
- mindspore/boost/boost_cell_wrapper.py +46 -26
- mindspore/boost/dim_reduce.py +6 -5
- mindspore/boost/grad_accumulation.py +2 -1
- mindspore/boost/group_loss_scale_manager.py +1 -1
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/cfgpersist.dll +0 -0
- mindspore/clang_rt.asan_dbg_dynamic-x86_64.dll +0 -0
- mindspore/clang_rt.asan_dynamic-x86_64.dll +0 -0
- mindspore/common/__init__.py +11 -10
- mindspore/common/_decorator.py +2 -0
- mindspore/common/_register_for_adapter.py +55 -0
- mindspore/common/_stub_tensor.py +201 -0
- mindspore/common/_utils.py +57 -0
- mindspore/common/api.py +582 -297
- mindspore/common/dtype.py +66 -18
- mindspore/common/dump.py +2 -2
- mindspore/common/initializer.py +38 -1
- mindspore/common/jit_config.py +25 -13
- mindspore/common/mutable.py +53 -24
- mindspore/common/parameter.py +60 -37
- mindspore/common/seed.py +8 -24
- mindspore/common/sparse_tensor.py +927 -0
- mindspore/common/tensor.py +1627 -3900
- mindspore/communication/__init__.py +10 -5
- mindspore/communication/_comm_helper.py +78 -214
- mindspore/communication/_hccl_management.py +2 -1
- mindspore/communication/management.py +136 -47
- mindspore/config/op_info.config +501 -1008
- mindspore/context.py +291 -56
- mindspore/d3dcompiler_47.dll +0 -0
- mindspore/dataset/__init__.py +12 -8
- mindspore/dataset/audio/__init__.py +9 -9
- mindspore/dataset/audio/transforms.py +1090 -228
- mindspore/dataset/audio/utils.py +87 -39
- mindspore/dataset/audio/validators.py +223 -1
- mindspore/dataset/callback/ds_callback.py +17 -15
- mindspore/dataset/core/config.py +246 -17
- mindspore/dataset/core/py_util_helpers.py +4 -3
- mindspore/dataset/core/validator_helpers.py +10 -10
- mindspore/{parallel/nn/layers.py → dataset/debug/__init__.py} +7 -8
- mindspore/dataset/debug/debug_hook.py +65 -0
- mindspore/dataset/debug/pre_defined_hook.py +67 -0
- mindspore/dataset/engine/__init__.py +7 -3
- mindspore/dataset/engine/cache_client.py +9 -9
- mindspore/dataset/engine/datasets.py +648 -477
- mindspore/dataset/engine/datasets_audio.py +165 -167
- mindspore/dataset/engine/datasets_standard_format.py +93 -67
- mindspore/dataset/engine/datasets_text.py +492 -342
- mindspore/dataset/engine/datasets_user_defined.py +85 -50
- mindspore/dataset/engine/datasets_vision.py +1224 -699
- mindspore/dataset/engine/graphdata.py +134 -69
- mindspore/dataset/engine/iterators.py +50 -9
- mindspore/dataset/engine/offload.py +52 -31
- mindspore/dataset/engine/samplers.py +27 -24
- mindspore/dataset/engine/serializer_deserializer.py +14 -15
- mindspore/dataset/engine/validators.py +213 -52
- mindspore/dataset/text/__init__.py +10 -8
- mindspore/dataset/text/transforms.py +152 -57
- mindspore/dataset/text/utils.py +98 -49
- mindspore/dataset/text/validators.py +25 -0
- mindspore/dataset/transforms/__init__.py +4 -2
- mindspore/dataset/transforms/c_transforms.py +11 -13
- mindspore/dataset/transforms/py_transforms.py +2 -2
- mindspore/dataset/transforms/py_transforms_util.py +10 -0
- mindspore/dataset/transforms/transforms.py +13 -15
- mindspore/dataset/transforms/validators.py +7 -7
- mindspore/dataset/utils/__init__.py +2 -1
- mindspore/dataset/utils/browse_dataset.py +13 -13
- mindspore/dataset/utils/line_reader.py +121 -0
- mindspore/dataset/vision/__init__.py +8 -7
- mindspore/dataset/vision/c_transforms.py +125 -126
- mindspore/dataset/vision/py_transforms.py +37 -37
- mindspore/dataset/vision/py_transforms_util.py +23 -20
- mindspore/dataset/vision/transforms.py +316 -315
- mindspore/dataset/vision/utils.py +313 -17
- mindspore/dataset/vision/validators.py +6 -6
- mindspore/default_config.py +0 -1
- mindspore/dpcmi.dll +0 -0
- mindspore/{compression → experimental}/__init__.py +6 -5
- mindspore/experimental/map_parameter.py +275 -0
- mindspore/include/OWNERS +0 -1
- mindspore/include/api/callback/callback.h +9 -13
- mindspore/include/api/callback/ckpt_saver.h +2 -2
- mindspore/include/api/callback/loss_monitor.h +2 -2
- mindspore/include/api/callback/lr_scheduler.h +5 -5
- mindspore/include/api/callback/time_monitor.h +2 -2
- mindspore/include/api/callback/train_accuracy.h +4 -6
- mindspore/include/api/cfg.h +19 -6
- mindspore/include/api/context.h +70 -9
- mindspore/include/api/delegate.h +8 -1
- mindspore/include/api/dual_abi_helper.h +8 -24
- mindspore/include/api/metrics/accuracy.h +2 -2
- mindspore/include/api/metrics/metrics.h +4 -3
- mindspore/include/api/model.h +9 -4
- mindspore/include/api/model_group.h +68 -0
- mindspore/include/api/model_parallel_runner.h +17 -17
- mindspore/include/api/net.h +12 -11
- mindspore/include/api/serialization.h +20 -4
- mindspore/include/api/status.h +7 -1
- mindspore/include/api/types.h +25 -21
- mindspore/include/api/visible.h +4 -0
- mindspore/include/c_api/model_c.h +5 -0
- mindspore/include/c_api/status_c.h +1 -1
- mindspore/include/dataset/config.h +1 -1
- mindspore/include/dataset/constants.h +14 -0
- mindspore/include/dataset/text.h +59 -0
- mindspore/include/dataset/vision.h +56 -117
- mindspore/include/dataset/vision_lite.h +102 -0
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +28 -28
- mindspore/mindrecord/common/exceptions.py +2 -4
- mindspore/mindrecord/filereader.py +19 -1
- mindspore/mindrecord/filewriter.py +250 -88
- mindspore/mindrecord/mindpage.py +13 -13
- mindspore/mindrecord/shardheader.py +15 -15
- mindspore/mindrecord/shardreader.py +9 -0
- mindspore/mindrecord/shardwriter.py +29 -29
- mindspore/mindrecord/tools/cifar100_to_mr.py +9 -9
- mindspore/mindrecord/tools/cifar10_to_mr.py +9 -9
- mindspore/mindrecord/tools/csv_to_mr.py +4 -4
- mindspore/mindrecord/tools/imagenet_to_mr.py +70 -65
- mindspore/mindrecord/tools/mnist_to_mr.py +41 -41
- mindspore/mindrecord/tools/tfrecord_to_mr.py +6 -6
- mindspore/{libmindspore_backend.dll → mindspore_backend.dll} +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/nn/__init__.py +1 -5
- mindspore/nn/cell.py +297 -234
- mindspore/nn/dynamic_lr.py +1 -1
- mindspore/nn/grad/cell_grad.py +17 -42
- mindspore/nn/layer/__init__.py +7 -4
- mindspore/nn/layer/activation.py +131 -88
- mindspore/nn/layer/basic.py +313 -613
- mindspore/nn/layer/channel_shuffle.py +103 -0
- mindspore/nn/layer/combined.py +1 -1
- mindspore/nn/layer/container.py +52 -6
- mindspore/nn/layer/conv.py +112 -43
- mindspore/nn/layer/dense.py +10 -9
- mindspore/nn/layer/embedding.py +36 -34
- mindspore/nn/layer/image.py +123 -27
- mindspore/nn/layer/math.py +108 -107
- mindspore/nn/layer/normalization.py +212 -366
- mindspore/nn/layer/padding.py +370 -42
- mindspore/nn/layer/pooling.py +1443 -219
- mindspore/nn/layer/rnn_cells.py +11 -16
- mindspore/nn/layer/rnns.py +38 -39
- mindspore/nn/layer/thor_layer.py +24 -25
- mindspore/nn/layer/timedistributed.py +5 -5
- mindspore/nn/layer/transformer.py +701 -0
- mindspore/nn/learning_rate_schedule.py +8 -8
- mindspore/nn/loss/__init__.py +9 -6
- mindspore/nn/loss/loss.py +678 -142
- mindspore/nn/metrics.py +53 -0
- mindspore/nn/optim/_dist_optimizer_registry.py +2 -2
- mindspore/nn/optim/ada_grad.py +8 -8
- mindspore/nn/optim/adadelta.py +2 -3
- mindspore/nn/optim/adafactor.py +18 -14
- mindspore/nn/optim/adam.py +429 -87
- mindspore/nn/optim/adamax.py +5 -6
- mindspore/nn/optim/adasum.py +10 -8
- mindspore/nn/optim/asgd.py +7 -7
- mindspore/nn/optim/ftrl.py +81 -11
- mindspore/nn/optim/lamb.py +7 -8
- mindspore/nn/optim/lars.py +4 -4
- mindspore/nn/optim/lazyadam.py +82 -7
- mindspore/nn/optim/momentum.py +8 -7
- mindspore/nn/optim/optimizer.py +19 -10
- mindspore/nn/optim/proximal_ada_grad.py +6 -5
- mindspore/nn/optim/rmsprop.py +3 -3
- mindspore/nn/optim/rprop.py +20 -16
- mindspore/nn/optim/sgd.py +21 -15
- mindspore/nn/optim/thor.py +23 -21
- mindspore/nn/probability/__init__.py +0 -2
- mindspore/nn/probability/bijector/bijector.py +7 -6
- mindspore/nn/probability/bijector/invert.py +4 -2
- mindspore/nn/probability/bijector/softplus.py +2 -2
- mindspore/nn/probability/bnn_layers/dense_variational.py +1 -1
- mindspore/nn/probability/bnn_layers/layer_distribution.py +2 -2
- mindspore/nn/probability/distribution/__init__.py +6 -0
- mindspore/nn/probability/distribution/_utils/custom_ops.py +3 -2
- mindspore/nn/probability/distribution/_utils/utils.py +11 -17
- mindspore/nn/probability/distribution/bernoulli.py +6 -6
- mindspore/nn/probability/distribution/beta.py +1 -1
- mindspore/nn/probability/distribution/categorical.py +9 -9
- mindspore/nn/probability/distribution/cauchy.py +8 -8
- mindspore/nn/probability/distribution/distribution.py +12 -6
- mindspore/nn/probability/distribution/exponential.py +5 -5
- mindspore/nn/probability/distribution/gamma.py +3 -3
- mindspore/nn/probability/distribution/geometric.py +6 -5
- mindspore/nn/probability/distribution/gumbel.py +5 -5
- mindspore/nn/probability/distribution/half_normal.py +133 -0
- mindspore/nn/probability/distribution/laplace.py +128 -0
- mindspore/nn/probability/distribution/log_normal.py +0 -1
- mindspore/nn/probability/distribution/logistic.py +4 -5
- mindspore/nn/probability/distribution/normal.py +11 -15
- mindspore/nn/probability/distribution/poisson.py +6 -2
- mindspore/nn/probability/distribution/student_t.py +150 -0
- mindspore/nn/probability/distribution/transformed_distribution.py +4 -4
- mindspore/nn/probability/distribution/uniform.py +5 -5
- mindspore/nn/reinforcement/_tensors_queue.py +3 -3
- mindspore/nn/reinforcement/tensor_array.py +2 -2
- mindspore/nn/sparse/sparse.py +8 -1
- mindspore/nn/wrap/cell_wrapper.py +55 -27
- mindspore/nn/wrap/grad_reducer.py +20 -11
- mindspore/nn/wrap/loss_scale.py +47 -30
- mindspore/numpy/array_creations.py +33 -22
- mindspore/numpy/array_ops.py +46 -42
- mindspore/numpy/logic_ops.py +6 -27
- mindspore/numpy/math_ops.py +26 -19
- mindspore/numpy/utils.py +1 -8
- mindspore/numpy/utils_const.py +112 -62
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +6 -3
- mindspore/ops/_constants.py +0 -6
- mindspore/ops/_grad/__init__.py +2 -1
- mindspore/ops/_grad/grad_array_ops.py +209 -152
- mindspore/ops/_grad/grad_base.py +55 -17
- mindspore/ops/_grad/grad_clip_ops.py +11 -3
- mindspore/ops/_grad/grad_comm_ops.py +58 -47
- mindspore/ops/_grad/grad_implementations.py +21 -61
- mindspore/ops/_grad/grad_inner_ops.py +48 -6
- mindspore/ops/_grad/grad_math_ops.py +306 -161
- mindspore/ops/_grad/grad_nn_ops.py +192 -181
- mindspore/ops/_grad/grad_other_ops.py +1 -1
- mindspore/ops/_grad/grad_quant_ops.py +5 -5
- mindspore/ops/_grad/grad_sequence_ops.py +296 -0
- mindspore/ops/_grad/grad_sparse.py +15 -9
- mindspore/ops/_grad_experimental/__init__.py +1 -0
- mindspore/ops/_grad_experimental/grad_array_ops.py +441 -55
- mindspore/ops/_grad_experimental/grad_image_ops.py +25 -7
- mindspore/ops/_grad_experimental/grad_inner_ops.py +3 -44
- mindspore/ops/_grad_experimental/grad_linalg_ops.py +16 -21
- mindspore/ops/_grad_experimental/grad_math_ops.py +979 -49
- mindspore/ops/_grad_experimental/grad_nn_ops.py +78 -8
- mindspore/ops/_grad_experimental/grad_scalar_ops.py +112 -0
- mindspore/ops/_grad_experimental/grad_sparse_ops.py +197 -13
- mindspore/ops/_op_impl/__init__.py +3 -3
- mindspore/ops/_op_impl/_custom_op/__init__.py +0 -1
- mindspore/ops/_op_impl/_custom_op/_basic.py +0 -1
- mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/batchnorm_fold.py +4 -2
- mindspore/ops/_op_impl/_custom_op/batchnorm_fold2.py +2 -2
- mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad.py +2 -2
- mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py +5 -5
- mindspore/ops/_op_impl/_custom_op/batchnorm_fold_grad.py +3 -3
- mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/correction_mul.py +3 -3
- mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py +2 -2
- mindspore/ops/_op_impl/_custom_op/dsd_back_impl.py +4 -8
- mindspore/ops/_op_impl/_custom_op/dsd_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad_reduce.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad_reduce.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel_grad.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer_grad.py +2 -2
- mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/img2col_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py +2 -2
- mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py +2 -2
- mindspore/ops/_op_impl/_custom_op/matmul_dds_grad_impl.py +0 -1
- mindspore/ops/_op_impl/_custom_op/matmul_dds_impl.py +0 -1
- mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py +1 -1
- mindspore/ops/_op_impl/_custom_op/minmax_update_perchannel.py +2 -2
- mindspore/ops/_op_impl/_custom_op/minmax_update_perlayer.py +2 -2
- mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py +1 -1
- mindspore/ops/_op_impl/aicpu/__init__.py +238 -3
- mindspore/ops/_op_impl/aicpu/abs.py +36 -0
- mindspore/ops/_op_impl/aicpu/adaptive_avg_pool_2d.py +34 -0
- mindspore/ops/_op_impl/aicpu/adaptive_avg_pool_2d_grad.py +34 -0
- mindspore/ops/_op_impl/aicpu/adaptive_avg_pool_3d.py +39 -0
- mindspore/ops/_op_impl/aicpu/adaptive_avg_pool_3d_grad.py +39 -0
- mindspore/ops/_op_impl/aicpu/adaptive_max_pool_2d_grad.py +37 -0
- mindspore/ops/_op_impl/aicpu/adaptive_max_pool_3d.py +42 -0
- mindspore/ops/_op_impl/aicpu/adaptive_max_pool_3d_grad.py +152 -0
- mindspore/ops/_op_impl/aicpu/add.py +43 -0
- mindspore/ops/_op_impl/aicpu/addcdiv.py +0 -32
- mindspore/ops/_op_impl/aicpu/addcmul.py +0 -84
- mindspore/ops/_op_impl/aicpu/affine_grid_grad.py +35 -0
- mindspore/ops/_op_impl/aicpu/arg_max.py +75 -0
- mindspore/ops/_op_impl/aicpu/arg_min.py +75 -0
- mindspore/ops/_op_impl/aicpu/argmin_with_value.py +43 -0
- mindspore/ops/_op_impl/aicpu/batch_matmul.py +43 -0
- mindspore/ops/_op_impl/aicpu/batch_norm_grad_grad.py +49 -0
- mindspore/ops/_op_impl/aicpu/bernoulli.py +48 -0
- mindspore/ops/_op_impl/aicpu/bessel_i0.py +31 -0
- mindspore/ops/_op_impl/aicpu/bias_add.py +44 -0
- mindspore/ops/_op_impl/aicpu/bias_add_grad.py +43 -0
- mindspore/ops/_op_impl/aicpu/bincount.py +33 -0
- mindspore/{nn/probability/infer/variational/__init__.py → ops/_op_impl/aicpu/cauchy.py} +17 -10
- mindspore/ops/_op_impl/aicpu/channel_shuffle.py +40 -0
- mindspore/ops/_op_impl/aicpu/cholesky.py +1 -1
- mindspore/ops/_op_impl/{cpu/bias_add.py → aicpu/choleskygrad.py} +9 -7
- mindspore/ops/_op_impl/aicpu/combined_non_max_suppression.py +42 -0
- mindspore/ops/_op_impl/aicpu/concat_offset.py +42 -0
- mindspore/ops/_op_impl/aicpu/concat_offset_v1.py +31 -0
- mindspore/ops/_op_impl/aicpu/conj.py +11 -0
- mindspore/ops/_op_impl/aicpu/crop_and_resize_grad_image.py +38 -0
- mindspore/ops/_op_impl/aicpu/cumulative_logsumexp.py +36 -0
- mindspore/ops/_op_impl/aicpu/deformable_offsets.py +38 -0
- mindspore/ops/_op_impl/aicpu/deformable_offsets_grad.py +2 -2
- mindspore/ops/_op_impl/aicpu/dense_to_sparse_set_operation.py +48 -0
- mindspore/ops/_op_impl/aicpu/diag.py +36 -0
- mindspore/ops/_op_impl/aicpu/diag_part.py +36 -0
- mindspore/ops/_op_impl/aicpu/diagonal.py +35 -0
- mindspore/ops/_op_impl/{cpu/bias_add_grad.py → aicpu/digamma.py} +9 -7
- mindspore/ops/_op_impl/aicpu/eig.py +35 -0
- mindspore/ops/_op_impl/aicpu/fft_with_size.py +41 -0
- mindspore/ops/_op_impl/aicpu/flatten.py +1 -0
- mindspore/ops/_op_impl/aicpu/fmax.py +36 -0
- mindspore/ops/_op_impl/aicpu/fmin.py +37 -0
- mindspore/ops/_op_impl/aicpu/fractional_max_pool3d_with_fixed_ksize.py +1 -1
- mindspore/ops/_op_impl/aicpu/fse_decode.py +43 -0
- mindspore/ops/_op_impl/aicpu/glu.py +33 -0
- mindspore/ops/_op_impl/aicpu/glu_grad.py +34 -0
- mindspore/ops/_op_impl/aicpu/greater.py +41 -0
- mindspore/ops/_op_impl/aicpu/greater_equal.py +41 -0
- mindspore/ops/_op_impl/aicpu/index_put.py +50 -0
- mindspore/ops/_op_impl/{tbe/scatter_add_ds.py → aicpu/inplace_index_add.py} +17 -21
- mindspore/ops/_op_impl/aicpu/instance_norm_v2.py +41 -0
- mindspore/ops/_op_impl/aicpu/instance_norm_v2_grad.py +44 -0
- mindspore/ops/_op_impl/aicpu/layer_norm_grad_grad.py +47 -0
- mindspore/ops/_op_impl/aicpu/less.py +41 -0
- mindspore/ops/_op_impl/aicpu/less_equal.py +41 -0
- mindspore/ops/_op_impl/aicpu/lgamma.py +32 -0
- mindspore/ops/_op_impl/aicpu/log_normal_reverse.py +33 -0
- mindspore/ops/_op_impl/aicpu/logit.py +33 -0
- mindspore/ops/_op_impl/aicpu/logit_grad.py +34 -0
- mindspore/ops/_op_impl/aicpu/masked_fill.py +42 -0
- mindspore/ops/_op_impl/aicpu/masked_scatter.py +39 -0
- mindspore/ops/_op_impl/aicpu/matmul.py +39 -0
- mindspore/ops/_op_impl/aicpu/matrix_logarithm.py +31 -0
- mindspore/ops/_op_impl/aicpu/matrix_power.py +32 -0
- mindspore/ops/_op_impl/aicpu/matrix_solve_ls.py +36 -0
- mindspore/ops/_op_impl/aicpu/matrix_triangular_solve.py +36 -0
- mindspore/ops/_op_impl/aicpu/mirror_pad.py +2 -0
- mindspore/ops/_op_impl/aicpu/mirror_pad_grad.py +0 -4
- mindspore/ops/_op_impl/aicpu/mul.py +3 -1
- mindspore/ops/_op_impl/aicpu/multinomial.py +14 -6
- mindspore/ops/_op_impl/aicpu/multinomial_with_replacement.py +35 -0
- mindspore/ops/_op_impl/aicpu/nan_to_num.py +34 -0
- mindspore/ops/_op_impl/aicpu/nllloss.py +38 -0
- mindspore/ops/_op_impl/aicpu/nllloss_grad.py +39 -0
- mindspore/ops/_op_impl/aicpu/ones_like.py +0 -2
- mindspore/ops/_op_impl/aicpu/polar.py +32 -0
- mindspore/ops/_op_impl/aicpu/polygamma.py +34 -0
- mindspore/ops/_op_impl/aicpu/qr.py +36 -0
- mindspore/ops/_op_impl/aicpu/quant_dtype_cast.py +40 -0
- mindspore/ops/_op_impl/aicpu/quantile.py +35 -0
- mindspore/ops/_op_impl/aicpu/ragged_tensor_to_sparse.py +73 -0
- mindspore/ops/_op_impl/aicpu/ragged_tensor_to_tensor.py +74 -0
- mindspore/ops/_op_impl/aicpu/random_shuffle.py +3 -0
- mindspore/ops/_op_impl/aicpu/randperm_v2.py +41 -0
- mindspore/ops/_op_impl/aicpu/range.py +36 -0
- mindspore/ops/_op_impl/aicpu/reciprocal.py +34 -0
- mindspore/ops/_op_impl/aicpu/reciprocal_grad.py +35 -0
- mindspore/ops/_op_impl/aicpu/reduce_sum.py +57 -0
- mindspore/ops/_op_impl/aicpu/resize_bicubic.py +2 -8
- mindspore/ops/_op_impl/aicpu/resize_bicubic_grad.py +1 -1
- mindspore/ops/_op_impl/aicpu/resize_v2.py +68 -0
- mindspore/ops/_op_impl/aicpu/resize_v2_grad.py +68 -0
- mindspore/ops/_op_impl/aicpu/scatter_elements.py +4 -0
- mindspore/ops/_op_impl/aicpu/scatter_nd_update.py +2 -0
- mindspore/ops/_op_impl/aicpu/search_sorted.py +12 -6
- mindspore/ops/_op_impl/aicpu/self_adjoint_eig.py +34 -0
- mindspore/ops/_op_impl/aicpu/sequence_add.py +34 -0
- mindspore/ops/_op_impl/aicpu/sequence_add_offset.py +34 -0
- mindspore/ops/_op_impl/aicpu/sequence_addn.py +38 -0
- mindspore/ops/_op_impl/aicpu/slice_grad.py +76 -0
- mindspore/ops/_op_impl/aicpu/smooth_l1_loss.py +35 -0
- mindspore/ops/_op_impl/aicpu/smooth_l1_loss_grad.py +37 -0
- mindspore/ops/_op_impl/aicpu/sort.py +39 -0
- mindspore/ops/_op_impl/aicpu/sparse_apply_adagrad_da.py +0 -24
- mindspore/ops/_op_impl/aicpu/sparse_cross.py +42 -0
- mindspore/ops/_op_impl/aicpu/sparse_fill_empty_rows.py +63 -0
- mindspore/ops/_op_impl/aicpu/sparse_fill_empty_rows_grad.py +45 -0
- mindspore/ops/_op_impl/aicpu/sparse_matrix_mat_mul.py +56 -0
- mindspore/ops/_op_impl/{tbe/slice_ds.py → aicpu/sparse_segment_sum.py} +16 -24
- mindspore/ops/_op_impl/aicpu/sparse_segment_sum_with_num_segments.py +68 -0
- mindspore/ops/_op_impl/aicpu/sparse_slice.py +63 -0
- mindspore/ops/_op_impl/aicpu/sparse_slice_grad.py +61 -0
- mindspore/ops/_op_impl/aicpu/squared_difference.py +2 -0
- mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +93 -0
- mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +66 -0
- mindspore/ops/_op_impl/aicpu/tensor_scatter_update.py +59 -0
- mindspore/ops/_op_impl/{tbe/gather_v2.py → aicpu/tile.py} +24 -24
- mindspore/ops/_op_impl/aicpu/tridiagonal_solve.py +35 -0
- mindspore/ops/_op_impl/aicpu/tril_indices.py +34 -0
- mindspore/ops/_op_impl/aicpu/triu_indices.py +34 -0
- mindspore/ops/_op_impl/aicpu/uniform.py +34 -0
- mindspore/ops/_op_impl/aicpu/uniform_candidate_sampler.py +1 -0
- mindspore/ops/_op_impl/aicpu/unique_consecutive.py +10 -2
- mindspore/ops/_op_impl/cpu/__init__.py +1 -2
- mindspore/ops/_op_impl/cpu/dynamic_shape.py +5 -1
- mindspore/ops/_op_impl/cpu/maximum_grad.py +2 -0
- mindspore/{compression/common/__init__.py → ops/_op_impl/cpu/pyexecute.py} +13 -8
- mindspore/ops/_op_impl/cpu/reduce_sum.py +8 -0
- mindspore/ops/_op_impl/cpu/sparse_slice.py +62 -0
- mindspore/ops/_op_impl/cpu/sparse_slice_grad.py +60 -0
- mindspore/ops/_op_impl/cpu/tensor_shape.py +5 -1
- mindspore/ops/_op_impl/tbe/__init__.py +27 -608
- mindspore/ops/_op_impl/tbe/addcdiv_ds.py +42 -0
- mindspore/ops/_op_impl/tbe/addcmul_ds.py +44 -0
- mindspore/ops/_op_impl/tbe/assign_add_ds.py +1 -0
- mindspore/ops/_op_impl/tbe/atomic_addr_clean.py +1 -1
- mindspore/ops/_op_impl/tbe/avg_pool_3d_grad.py +1 -1
- mindspore/ops/_op_impl/tbe/basic_lstm_cell_c_state_grad_v2.py +0 -1
- mindspore/ops/_op_impl/tbe/batch_to_space.py +1 -1
- mindspore/ops/_op_impl/tbe/batch_to_space_nd.py +1 -1
- mindspore/ops/_op_impl/tbe/batch_to_space_nd_v2.py +41 -0
- mindspore/ops/_op_impl/tbe/bce_with_logits_loss.py +1 -0
- mindspore/ops/_op_impl/tbe/bias_add_grad.py +2 -0
- mindspore/ops/_op_impl/tbe/bn_infer_grad.py +4 -2
- mindspore/ops/_op_impl/tbe/bn_infer_grad_ds.py +40 -0
- mindspore/ops/_op_impl/tbe/bn_training_update.py +0 -1
- mindspore/ops/_op_impl/tbe/bn_training_update_ds.py +0 -1
- mindspore/ops/_op_impl/tbe/broadcast_to_ds.py +6 -4
- mindspore/ops/_op_impl/tbe/cast.py +0 -2
- mindspore/ops/_op_impl/tbe/cast_ds.py +3 -3
- mindspore/ops/_op_impl/tbe/ctc_loss_v2.py +0 -2
- mindspore/ops/_op_impl/tbe/ctc_loss_v2_grad.py +0 -2
- mindspore/ops/_op_impl/tbe/data_format_dim_map_ds.py +1 -0
- mindspore/ops/_op_impl/tbe/deformable_offsets.py +1 -0
- mindspore/ops/_op_impl/tbe/depthwise_conv2d.py +1 -1
- mindspore/ops/_op_impl/tbe/dynamic_atomic_addr_clean.py +1 -1
- mindspore/ops/_op_impl/tbe/gather_nd.py +1 -0
- mindspore/ops/_op_impl/tbe/greater.py +2 -0
- mindspore/ops/_op_impl/tbe/{index_add.py → inplace_index_add.py} +3 -6
- mindspore/ops/_op_impl/tbe/layer_norm_beta_gamma_backprop_v2.py +0 -1
- mindspore/ops/_op_impl/tbe/npu_clear_float_status_v2.py +35 -0
- mindspore/ops/_op_impl/tbe/npu_get_float_status_v2.py +35 -0
- mindspore/ops/_op_impl/tbe/one_hot_ds.py +0 -6
- mindspore/ops/_op_impl/tbe/{greater_ds.py → reduce_all_ds.py} +13 -16
- mindspore/ops/_op_impl/tbe/reduce_any_ds.py +39 -0
- mindspore/ops/_op_impl/tbe/roi_align_ds.py +44 -0
- mindspore/ops/_op_impl/tbe/roi_align_grad_ds.py +44 -0
- mindspore/ops/_op_impl/tbe/scatter_add.py +2 -0
- mindspore/ops/_op_impl/tbe/scatter_nd_add.py +2 -2
- mindspore/ops/_op_impl/tbe/slice.py +26 -15
- mindspore/ops/_op_impl/tbe/space_to_batch.py +1 -1
- mindspore/ops/_op_impl/tbe/space_to_batch_nd.py +1 -1
- mindspore/ops/_op_impl/tbe/strided_slice_grad_d.py +1 -0
- mindspore/ops/_op_impl/tbe/trans_data_ds.py +15 -5
- mindspore/ops/_op_impl/tbe/unsorted_segment_sum.py +1 -1
- mindspore/ops/_op_impl/tbe/unsorted_segment_sum_ds.py +2 -0
- mindspore/ops/_primitive_cache.py +3 -2
- mindspore/ops/_register_for_op.py +11 -0
- mindspore/ops/_utils/__init__.py +1 -1
- mindspore/ops/_utils/utils.py +20 -41
- mindspore/ops/_vmap/__init__.py +2 -2
- mindspore/ops/_vmap/vmap_array_ops.py +170 -78
- mindspore/ops/_vmap/vmap_base.py +24 -10
- mindspore/ops/_vmap/vmap_convolution_ops.py +7 -10
- mindspore/ops/_vmap/vmap_grad_math_ops.py +4 -4
- mindspore/ops/_vmap/vmap_grad_nn_ops.py +41 -9
- mindspore/ops/_vmap/vmap_image_ops.py +52 -0
- mindspore/ops/_vmap/vmap_math_ops.py +77 -6
- mindspore/ops/_vmap/vmap_nn_ops.py +78 -29
- mindspore/ops/_vmap/vmap_other_ops.py +3 -1
- mindspore/ops/_vmap/vmap_random_ops.py +55 -3
- mindspore/ops/_vmap/vmap_sparse_ops.py +1 -0
- mindspore/ops/bprop_mindir/AdaptiveAvgPool2D_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/AdaptiveMaxPool2D_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/ApproximateEqual_bprop.mindir +18 -19
- mindspore/ops/bprop_mindir/Argmax_bprop.mindir +13 -12
- mindspore/ops/bprop_mindir/Argmin_bprop.mindir +14 -13
- mindspore/ops/bprop_mindir/AssignSub_bprop.mindir +17 -18
- mindspore/ops/bprop_mindir/Assign_bprop.mindir +16 -16
- mindspore/ops/bprop_mindir/AvgPool3D_bprop.mindir +150 -0
- mindspore/ops/bprop_mindir/AvgPool_bprop.mindir +66 -0
- mindspore/ops/bprop_mindir/BCEWithLogitsLoss_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/BNTrainingReduce_bprop.mindir +13 -12
- mindspore/ops/bprop_mindir/BatchNormGrad_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/BatchToSpaceND_bprop.mindir +28 -0
- mindspore/ops/bprop_mindir/BiasAddGrad_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/BinaryCrossEntropy_bprop.mindir +33 -0
- mindspore/ops/bprop_mindir/BroadcastTo_bprop.mindir +306 -0
- mindspore/ops/bprop_mindir/Broadcast_bprop.mindir +12 -8
- mindspore/ops/bprop_mindir/CTCLoss_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Concat_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Conv2DBackpropFilter_bprop.mindir +240 -0
- mindspore/ops/bprop_mindir/Conv2DBackpropInput_bprop.mindir +247 -0
- mindspore/ops/bprop_mindir/Conv2DTranspose_bprop.mindir +247 -0
- mindspore/ops/bprop_mindir/Conv3DTranspose_bprop.mindir +315 -0
- mindspore/ops/bprop_mindir/Conv3D_bprop.mindir +278 -0
- mindspore/ops/bprop_mindir/DType_bprop.mindir +12 -12
- mindspore/ops/bprop_mindir/DeformableOffsets_bprop.mindir +58 -0
- mindspore/ops/bprop_mindir/Depend_bprop.mindir +12 -13
- mindspore/ops/bprop_mindir/DepthToSpace_bprop.mindir +23 -0
- mindspore/ops/bprop_mindir/DepthwiseConv2dNative_bprop.mindir +138 -0
- mindspore/ops/bprop_mindir/DiagPart_bprop.mindir +15 -0
- mindspore/ops/bprop_mindir/Dropout2D_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Dropout3D_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/DropoutDoMask_bprop.mindir +22 -24
- mindspore/ops/bprop_mindir/DropoutGenMask_bprop.mindir +16 -14
- mindspore/ops/bprop_mindir/DropoutGrad_bprop.mindir +27 -0
- mindspore/ops/bprop_mindir/Dropout_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/DynamicGRUV2_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/DynamicRNN_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/DynamicShape_bprop.mindir +12 -12
- mindspore/ops/bprop_mindir/Elu_bprop.mindir +16 -0
- mindspore/ops/bprop_mindir/EmbeddingLookup_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Equal_bprop.mindir +18 -19
- mindspore/ops/bprop_mindir/ExpandDims_bprop.mindir +58 -0
- mindspore/ops/bprop_mindir/FastGeLU_bprop.mindir +16 -0
- mindspore/ops/bprop_mindir/Flatten_bprop.mindir +54 -0
- mindspore/ops/bprop_mindir/FloorDiv_bprop.mindir +18 -15
- mindspore/ops/bprop_mindir/GatherD_bprop.mindir +26 -0
- mindspore/ops/bprop_mindir/GatherNd_bprop.mindir +57 -0
- mindspore/ops/bprop_mindir/Gather_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/GreaterEqual_bprop.mindir +17 -18
- mindspore/ops/bprop_mindir/Greater_bprop.mindir +18 -19
- mindspore/ops/bprop_mindir/HSigmoid_bprop.mindir +16 -0
- mindspore/ops/bprop_mindir/HSwish_bprop.mindir +16 -0
- mindspore/ops/bprop_mindir/IOU_bprop.mindir +18 -19
- mindspore/ops/bprop_mindir/InstanceNorm_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/IsFinite_bprop.mindir +13 -12
- mindspore/ops/bprop_mindir/IsInf_bprop.mindir +13 -10
- mindspore/ops/bprop_mindir/IsNan_bprop.mindir +14 -11
- mindspore/ops/bprop_mindir/KLDivLoss_bprop.mindir +126 -0
- mindspore/ops/bprop_mindir/L2Loss_bprop.mindir +15 -0
- mindspore/ops/bprop_mindir/L2Normalize_bprop.mindir +30 -0
- mindspore/ops/bprop_mindir/LRN_bprop.mindir +43 -0
- mindspore/ops/bprop_mindir/LayerNormGrad_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/LessEqual_bprop.mindir +18 -19
- mindspore/ops/bprop_mindir/Less_bprop.mindir +17 -18
- mindspore/ops/bprop_mindir/LinSpace_bprop.mindir +22 -19
- mindspore/ops/bprop_mindir/Load_bprop.mindir +12 -13
- mindspore/ops/bprop_mindir/LogSoftmax_bprop.mindir +23 -0
- mindspore/ops/bprop_mindir/LogicalAnd_bprop.mindir +17 -18
- mindspore/ops/bprop_mindir/LogicalNot_bprop.mindir +14 -13
- mindspore/ops/bprop_mindir/MaskedSelect_bprop.mindir +21 -0
- mindspore/ops/bprop_mindir/MaxPool3DGradGrad_bprop.mindir +74 -0
- mindspore/ops/bprop_mindir/MaxPool3DGrad_bprop.mindir +74 -0
- mindspore/ops/bprop_mindir/MaxPool3D_bprop.mindir +75 -0
- mindspore/ops/bprop_mindir/MaxPoolGradGrad_bprop.mindir +65 -0
- mindspore/ops/bprop_mindir/MaxPoolWithArgmax_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Maximum_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Minimum_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/MirrorPad_bprop.mindir +27 -0
- mindspore/ops/bprop_mindir/Mish_bprop.mindir +35 -0
- mindspore/ops/bprop_mindir/MulNoNan_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/NLLLoss_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/NonZero_bprop.mindir +14 -0
- mindspore/ops/bprop_mindir/NotEqual_bprop.mindir +18 -19
- mindspore/ops/bprop_mindir/OneHot_bprop.mindir +25 -23
- mindspore/ops/bprop_mindir/OnesLike_bprop.mindir +13 -13
- mindspore/ops/bprop_mindir/PReLU_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Pad_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Padding_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/RNNTLoss_bprop.mindir +29 -0
- mindspore/ops/bprop_mindir/ROIAlign_bprop.mindir +82 -0
- mindspore/ops/bprop_mindir/Range_bprop.mindir +21 -19
- mindspore/ops/bprop_mindir/Rank_bprop.mindir +11 -11
- mindspore/ops/bprop_mindir/ReLU6_bprop.mindir +16 -0
- mindspore/ops/bprop_mindir/ReLUV2_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/ReduceAll_bprop.mindir +18 -17
- mindspore/ops/bprop_mindir/ReduceAny_bprop.mindir +18 -17
- mindspore/ops/bprop_mindir/ReluGrad_bprop.mindir +19 -23
- mindspore/ops/bprop_mindir/Reshape_bprop.mindir +60 -0
- mindspore/ops/bprop_mindir/ResizeBilinear_bprop.mindir +29 -0
- mindspore/ops/bprop_mindir/ResizeNearestNeighbor_bprop.mindir +89 -0
- mindspore/ops/bprop_mindir/ReverseSequence_bprop.mindir +52 -0
- mindspore/ops/bprop_mindir/ReverseV2_bprop.mindir +22 -0
- mindspore/ops/bprop_mindir/Round_bprop.mindir +14 -13
- mindspore/ops/bprop_mindir/ScatterMax_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/ScatterMin_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/ScatterNdUpdate_bprop.mindir +22 -0
- mindspore/ops/bprop_mindir/ScatterNd_bprop.mindir +24 -0
- mindspore/ops/bprop_mindir/ScatterNonAliasingAdd_bprop.mindir +22 -0
- mindspore/ops/bprop_mindir/ScatterUpdate_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/SeLU_bprop.mindir +21 -0
- mindspore/ops/bprop_mindir/Select_bprop.mindir +30 -34
- mindspore/ops/bprop_mindir/Shape_bprop.mindir +12 -12
- mindspore/ops/bprop_mindir/SigmoidCrossEntropyWithLogits_bprop.mindir +21 -0
- mindspore/ops/bprop_mindir/SigmoidGrad_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Sigmoid_bprop.mindir +16 -0
- mindspore/ops/bprop_mindir/Sign_bprop.mindir +13 -12
- mindspore/ops/bprop_mindir/Slice_bprop.mindir +26 -0
- mindspore/ops/bprop_mindir/SmoothL1Loss_bprop.mindir +36 -0
- mindspore/ops/bprop_mindir/SoftmaxCrossEntropyWithLogits_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Softplus_bprop.mindir +16 -0
- mindspore/ops/bprop_mindir/Softsign_bprop.mindir +33 -0
- mindspore/ops/bprop_mindir/Sort_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/SpaceToBatchND_bprop.mindir +28 -0
- mindspore/ops/bprop_mindir/SpaceToDepth_bprop.mindir +23 -0
- mindspore/ops/bprop_mindir/SparseGatherV2_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/SparseSoftmaxCrossEntropyWithLogits_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Split_bprop.mindir +22 -0
- mindspore/ops/bprop_mindir/Squeeze_bprop.mindir +54 -0
- mindspore/ops/bprop_mindir/StridedSliceGrad_bprop.mindir +95 -0
- mindspore/ops/bprop_mindir/StridedSlice_bprop.mindir +98 -0
- mindspore/ops/bprop_mindir/Switch_bprop.mindir +28 -32
- mindspore/ops/bprop_mindir/TanhGrad_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/Tanh_bprop.mindir +66 -0
- mindspore/ops/bprop_mindir/TensorScatterAdd_bprop.mindir +22 -0
- mindspore/ops/bprop_mindir/TensorScatterUpdate_bprop.mindir +29 -0
- mindspore/ops/bprop_mindir/TensorShape_bprop.mindir +14 -0
- mindspore/ops/bprop_mindir/Tile_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/TopK_bprop.mindir +0 -0
- mindspore/ops/bprop_mindir/TransShape_bprop.mindir +23 -0
- mindspore/ops/bprop_mindir/TruncateDiv_bprop.mindir +18 -15
- mindspore/ops/bprop_mindir/TupleGetItem_bprop.mindir +11 -13
- mindspore/ops/bprop_mindir/Unique_bprop.mindir +16 -0
- mindspore/ops/bprop_mindir/Unstack_bprop.mindir +22 -0
- mindspore/ops/bprop_mindir/UpsampleNearest3D_bprop.mindir +32 -0
- mindspore/ops/bprop_mindir/UpsampleTrilinear3D_bprop.mindir +38 -0
- mindspore/ops/bprop_mindir/ZerosLike_bprop.mindir +13 -12
- mindspore/ops/bprop_mindir/__init__.py +1 -4
- mindspore/ops/bprop_mindir/generate_mindir.py +32 -20
- mindspore/ops/composite/__init__.py +12 -13
- mindspore/ops/composite/base.py +261 -254
- mindspore/ops/composite/env_ops.py +41 -0
- mindspore/ops/composite/math_ops.py +197 -156
- mindspore/ops/composite/multitype_ops/_compile_utils.py +428 -176
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +188 -87
- mindspore/ops/composite/multitype_ops/add_impl.py +23 -1
- mindspore/ops/composite/multitype_ops/div_impl.py +3 -3
- mindspore/ops/composite/multitype_ops/equal_impl.py +1 -0
- mindspore/ops/composite/multitype_ops/floordiv_impl.py +1 -1
- mindspore/ops/composite/multitype_ops/getitem_impl.py +52 -5
- mindspore/ops/composite/multitype_ops/greater_equal_impl.py +31 -0
- mindspore/ops/composite/multitype_ops/greater_impl.py +31 -0
- mindspore/ops/composite/multitype_ops/in_impl.py +15 -3
- mindspore/ops/composite/multitype_ops/less_equal_impl.py +33 -2
- mindspore/ops/composite/multitype_ops/less_impl.py +33 -0
- mindspore/ops/composite/multitype_ops/logical_and_impl.py +2 -2
- mindspore/ops/composite/multitype_ops/logical_or_impl.py +2 -1
- mindspore/ops/composite/multitype_ops/mod_impl.py +1 -1
- mindspore/ops/composite/multitype_ops/mul_impl.py +21 -7
- mindspore/ops/composite/multitype_ops/not_in_impl.py +15 -3
- mindspore/ops/composite/multitype_ops/ones_like_impl.py +2 -4
- mindspore/ops/composite/multitype_ops/pow_impl.py +1 -0
- mindspore/ops/composite/multitype_ops/setitem_impl.py +62 -70
- mindspore/ops/composite/multitype_ops/sub_impl.py +3 -3
- mindspore/ops/composite/multitype_ops/zeros_like_impl.py +41 -4
- mindspore/ops/function/__init__.py +323 -8
- mindspore/ops/function/array_func.py +3511 -780
- mindspore/ops/function/clip_func.py +329 -0
- mindspore/ops/function/debug_func.py +6 -6
- mindspore/ops/function/grad/__init__.py +5 -1
- mindspore/ops/function/grad/grad_func.py +736 -65
- mindspore/ops/function/image_func.py +270 -0
- mindspore/ops/function/linalg_func.py +268 -8
- mindspore/ops/function/math_func.py +8032 -3164
- mindspore/ops/function/nn_func.py +5619 -1855
- mindspore/ops/function/other_func.py +115 -0
- mindspore/ops/function/parameter_func.py +11 -10
- mindspore/ops/function/random_func.py +939 -77
- mindspore/ops/function/sparse_func.py +249 -84
- mindspore/ops/function/sparse_unary_func.py +2303 -0
- mindspore/ops/function/spectral_func.py +146 -0
- mindspore/ops/function/vmap_func.py +114 -0
- mindspore/ops/functional.py +182 -254
- mindspore/ops/op_info_register.py +79 -34
- mindspore/ops/operations/__init__.py +210 -118
- mindspore/ops/operations/_csr_ops.py +7 -7
- mindspore/ops/operations/_embedding_cache_ops.py +25 -15
- mindspore/ops/operations/_grad_ops.py +447 -322
- mindspore/ops/operations/_inner_ops.py +547 -176
- mindspore/ops/operations/_map_tensor_ops.py +112 -0
- mindspore/ops/operations/_ms_kernel.py +29 -27
- mindspore/ops/operations/_ocr_ops.py +11 -11
- mindspore/ops/operations/_opaque_predicate_registry.py +41 -0
- mindspore/ops/operations/_quant_ops.py +186 -101
- mindspore/ops/operations/_rl_inner_ops.py +122 -61
- mindspore/ops/operations/_scalar_ops.py +466 -0
- mindspore/ops/operations/_sequence_ops.py +1047 -0
- mindspore/ops/operations/_tensor_array.py +10 -11
- mindspore/ops/operations/_thor_ops.py +4 -4
- mindspore/ops/operations/array_ops.py +1428 -1226
- mindspore/ops/operations/comm_ops.py +180 -117
- mindspore/ops/operations/control_ops.py +4 -2
- mindspore/ops/operations/custom_ops.py +185 -98
- mindspore/ops/operations/debug_ops.py +92 -54
- mindspore/ops/operations/image_ops.py +406 -211
- mindspore/ops/operations/inner_ops.py +42 -53
- mindspore/ops/operations/linalg_ops.py +32 -29
- mindspore/ops/operations/math_ops.py +2076 -897
- mindspore/ops/operations/nn_ops.py +1282 -1252
- mindspore/ops/operations/other_ops.py +124 -278
- mindspore/ops/operations/random_ops.py +345 -178
- mindspore/ops/operations/rl_ops.py +8 -9
- mindspore/ops/operations/sparse_ops.py +502 -157
- mindspore/ops/operations/spectral_ops.py +107 -0
- mindspore/ops/primitive.py +192 -15
- mindspore/ops/vm_impl_registry.py +23 -2
- mindspore/parallel/__init__.py +6 -1
- mindspore/parallel/_auto_parallel_context.py +199 -92
- mindspore/parallel/_cell_wrapper.py +4 -2
- mindspore/parallel/_cost_model_context.py +3 -0
- mindspore/parallel/_dp_allreduce_fusion.py +2 -1
- mindspore/parallel/_offload_context.py +185 -0
- mindspore/parallel/_parallel_serialization.py +167 -28
- mindspore/parallel/_ps_context.py +9 -5
- mindspore/parallel/_recovery_context.py +1 -1
- mindspore/parallel/_tensor.py +9 -1
- mindspore/{nn/transformer → parallel/_transformer}/__init__.py +6 -6
- mindspore/{nn/transformer → parallel/_transformer}/layers.py +59 -37
- mindspore/{nn/transformer → parallel/_transformer}/loss.py +4 -7
- mindspore/{nn/transformer → parallel/_transformer}/moe.py +160 -35
- mindspore/{nn/transformer → parallel/_transformer}/op_parallel_config.py +3 -3
- mindspore/{nn/transformer → parallel/_transformer}/transformer.py +235 -196
- mindspore/parallel/_utils.py +47 -7
- mindspore/parallel/algo_parameter_config.py +5 -1
- mindspore/parallel/checkpoint_transform.py +329 -0
- mindspore/parallel/shard.py +229 -0
- mindspore/perf_msvcbuildinsights.dll +0 -0
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/common/util.py +4 -3
- mindspore/profiler/common/validator/validate_path.py +2 -2
- mindspore/profiler/envprofiling.py +249 -0
- mindspore/profiler/parser/aicpu_data_parser.py +38 -39
- mindspore/profiler/parser/ascend_timeline_generator.py +497 -0
- mindspore/profiler/parser/base_timeline_generator.py +471 -0
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +684 -0
- mindspore/profiler/parser/framework_parser.py +42 -16
- mindspore/profiler/parser/hccl_parser.py +158 -158
- mindspore/profiler/parser/hwts_log_parser.py +7 -6
- mindspore/profiler/parser/integrator.py +18 -1579
- mindspore/profiler/parser/minddata_analyzer.py +8 -8
- mindspore/profiler/parser/msadvisor_analyzer.py +14 -27
- mindspore/profiler/parser/msadvisor_parser.py +2 -4
- mindspore/profiler/parser/optime_parser.py +17 -18
- mindspore/profiler/parser/profiler_info.py +108 -0
- mindspore/profiler/parser/step_trace_parser.py +1 -1
- mindspore/profiler/profiling.py +396 -194
- mindspore/rewrite/__init__.py +6 -2
- mindspore/rewrite/api/node.py +51 -110
- mindspore/rewrite/api/node_type.py +10 -6
- mindspore/rewrite/api/pattern_engine.py +51 -7
- mindspore/rewrite/api/scoped_value.py +64 -53
- mindspore/rewrite/api/symbol_tree.py +108 -61
- mindspore/rewrite/api/tree_node_helper.py +2 -3
- mindspore/{compression/quant/__init__.py → rewrite/ast_creator_register.py} +20 -11
- mindspore/rewrite/ast_helpers/__init__.py +6 -3
- mindspore/rewrite/ast_helpers/ast_creator.py +115 -0
- mindspore/rewrite/ast_helpers/ast_finder.py +99 -1
- mindspore/rewrite/ast_helpers/ast_modifier.py +17 -4
- mindspore/rewrite/ast_helpers/ast_replacer.py +1 -1
- mindspore/rewrite/ast_transformers/__init__.py +0 -1
- mindspore/rewrite/ast_transformers/flatten_recursive_stmt.py +46 -5
- mindspore/rewrite/ast_transformers/remove_return_out_of_if.py +6 -3
- mindspore/rewrite/common/__init__.py +2 -0
- mindspore/rewrite/common/event.py +1 -1
- mindspore/rewrite/common/observable.py +1 -1
- mindspore/rewrite/common/observer.py +1 -1
- mindspore/rewrite/common/rewrite_elog.py +35 -0
- mindspore/rewrite/namer.py +2 -2
- mindspore/rewrite/namespace.py +14 -4
- mindspore/rewrite/node.py +161 -13
- mindspore/rewrite/parser.py +0 -1
- mindspore/rewrite/parser_register.py +0 -1
- mindspore/rewrite/parsers/arguments_parser.py +3 -2
- mindspore/rewrite/parsers/assign_parser.py +267 -67
- mindspore/rewrite/parsers/attribute_parser.py +56 -0
- mindspore/rewrite/parsers/class_def_parser.py +191 -108
- mindspore/rewrite/parsers/constant_parser.py +101 -0
- mindspore/rewrite/parsers/container_parser.py +88 -0
- mindspore/rewrite/parsers/for_parser.py +28 -15
- mindspore/rewrite/parsers/function_def_parser.py +21 -5
- mindspore/rewrite/parsers/if_parser.py +11 -28
- mindspore/rewrite/parsers/module_parser.py +9 -6
- mindspore/rewrite/parsers/return_parser.py +3 -2
- mindspore/rewrite/sparsify/__init__.py +0 -0
- mindspore/rewrite/sparsify/sparse_transformer.py +448 -0
- mindspore/rewrite/sparsify/sparsify.py +109 -0
- mindspore/rewrite/sparsify/utils.py +173 -0
- mindspore/rewrite/symbol_tree.py +322 -109
- mindspore/rewrite/symbol_tree_builder.py +45 -8
- mindspore/rewrite/symbol_tree_dumper.py +0 -1
- mindspore/rewrite/topological_manager.py +1 -2
- mindspore/run_check/_check_version.py +209 -112
- mindspore/run_check/run_check.py +2 -1
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +6 -4
- mindspore/train/_utils.py +28 -5
- mindspore/train/amp.py +321 -50
- mindspore/train/callback/__init__.py +3 -1
- mindspore/train/callback/_backup_and_restore.py +120 -0
- mindspore/train/callback/_callback.py +8 -8
- mindspore/train/callback/_checkpoint.py +12 -9
- mindspore/train/callback/_early_stop.py +13 -7
- mindspore/train/callback/_history.py +8 -8
- mindspore/train/callback/_lambda_callback.py +6 -6
- mindspore/train/callback/_landscape.py +36 -38
- mindspore/train/callback/_loss_monitor.py +12 -6
- mindspore/train/callback/_lr_scheduler_callback.py +2 -4
- mindspore/train/callback/_on_request_exit.py +212 -0
- mindspore/train/callback/_reduce_lr_on_plateau.py +13 -7
- mindspore/train/callback/_summary_collector.py +27 -19
- mindspore/train/callback/_time_monitor.py +13 -7
- mindspore/train/checkpoint_pb2.py +68 -8
- mindspore/train/data_sink.py +122 -33
- mindspore/train/dataset_helper.py +28 -87
- mindspore/train/loss_scale_manager.py +4 -7
- mindspore/{nn → train}/metrics/__init__.py +20 -20
- mindspore/{nn → train}/metrics/accuracy.py +12 -10
- mindspore/{nn → train}/metrics/auc.py +4 -4
- mindspore/{nn → train}/metrics/bleu_score.py +4 -4
- mindspore/{nn → train}/metrics/confusion_matrix.py +10 -8
- mindspore/{nn → train}/metrics/cosine_similarity.py +4 -4
- mindspore/{nn → train}/metrics/dice.py +6 -5
- mindspore/{nn → train}/metrics/error.py +7 -5
- mindspore/{nn → train}/metrics/fbeta.py +9 -7
- mindspore/{nn → train}/metrics/hausdorff_distance.py +8 -6
- mindspore/{nn → train}/metrics/loss.py +4 -3
- mindspore/{nn → train}/metrics/mean_surface_distance.py +6 -5
- mindspore/{nn → train}/metrics/metric.py +6 -5
- mindspore/{nn → train}/metrics/occlusion_sensitivity.py +4 -3
- mindspore/{nn → train}/metrics/perplexity.py +5 -4
- mindspore/{nn → train}/metrics/precision.py +5 -4
- mindspore/{nn → train}/metrics/recall.py +5 -4
- mindspore/{nn → train}/metrics/roc.py +7 -6
- mindspore/{nn → train}/metrics/root_mean_square_surface_distance.py +6 -5
- mindspore/{nn → train}/metrics/topk.py +7 -5
- mindspore/train/mind_ir_pb2.py +339 -32
- mindspore/train/model.py +113 -84
- mindspore/train/serialization.py +547 -167
- mindspore/train/summary/_summary_adapter.py +1 -1
- mindspore/train/summary/summary_record.py +43 -12
- mindspore/train/train_thor/convert_utils.py +7 -1
- mindspore/train/train_thor/dataset_helper.py +3 -3
- mindspore/train/train_thor/model_thor.py +0 -4
- mindspore/turbojpeg.dll +0 -0
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-1.10.0.dist-info → mindspore-2.0.0rc1.dist-info}/METADATA +4 -3
- {mindspore-1.10.0.dist-info → mindspore-2.0.0rc1.dist-info}/RECORD +901 -660
- mindspore/compression/common/constant.py +0 -124
- mindspore/compression/export/__init__.py +0 -19
- mindspore/compression/export/quant_export.py +0 -514
- mindspore/compression/quant/qat.py +0 -636
- mindspore/compression/quant/quant_utils.py +0 -462
- mindspore/compression/quant/quantizer.py +0 -68
- mindspore/libatomic-1.dll +0 -0
- mindspore/libgcc_s_seh-1.dll +0 -0
- mindspore/libgfortran-4.dll +0 -0
- mindspore/libgomp-1.dll +0 -0
- mindspore/libjpeg-62.dll +0 -0
- mindspore/libmindspore.dll +0 -0
- mindspore/libmindspore_common.dll +0 -0
- mindspore/libmindspore_core.dll +0 -0
- mindspore/libmindspore_glog.dll +0 -0
- mindspore/libnnacl.dll +0 -0
- mindspore/libopencv_core452.dll +0 -0
- mindspore/libopencv_imgcodecs452.dll +0 -0
- mindspore/libopencv_imgproc452.dll +0 -0
- mindspore/libquadmath-0.dll +0 -0
- mindspore/libsqlite3.dll +0 -0
- mindspore/libssp-0.dll +0 -0
- mindspore/libstdc++-6.dll +0 -0
- mindspore/libtinyxml2.dll +0 -0
- mindspore/libturbojpeg.dll +0 -0
- mindspore/libwinpthread-1.dll +0 -0
- mindspore/nn/layer/quant.py +0 -1868
- mindspore/nn/layer/rnn_utils.py +0 -90
- mindspore/nn/probability/dpn/__init__.py +0 -22
- mindspore/nn/probability/dpn/vae/__init__.py +0 -25
- mindspore/nn/probability/dpn/vae/cvae.py +0 -138
- mindspore/nn/probability/dpn/vae/vae.py +0 -122
- mindspore/nn/probability/infer/__init__.py +0 -22
- mindspore/nn/probability/infer/variational/elbo.py +0 -70
- mindspore/nn/probability/infer/variational/svi.py +0 -84
- mindspore/nn/probability/toolbox/__init__.py +0 -22
- mindspore/nn/probability/toolbox/anomaly_detection.py +0 -99
- mindspore/nn/probability/toolbox/uncertainty_evaluation.py +0 -363
- mindspore/nn/probability/transforms/__init__.py +0 -22
- mindspore/nn/probability/transforms/transform_bnn.py +0 -262
- mindspore/nn/probability/zhusuan/__init__.py +0 -18
- mindspore/nn/probability/zhusuan/framework/__init__.py +0 -18
- mindspore/nn/probability/zhusuan/framework/bn.py +0 -95
- mindspore/nn/probability/zhusuan/variational/__init__.py +0 -18
- mindspore/nn/probability/zhusuan/variational/elbo.py +0 -46
- mindspore/ops/_op_impl/tbe/bias_add_grad_ds.py +0 -52
- mindspore/ops/_op_impl/tbe/scatter_nd_add_ds.py +0 -43
- mindspore/ops/bprop_mindir/AssignAdd_bprop.mindir +0 -20
- mindspore/ops/bprop_mindir/Identity_bprop.mindir +0 -9
- mindspore/ops/bprop_mindir/LogicalOr_bprop.mindir +0 -20
- mindspore/ops/bprop_mindir/ReLU_bprop.mindir +0 -16
- mindspore/ops/bprop_mindir/UpdateState_bprop.mindir +0 -17
- mindspore/ops/bprop_mindir/stop_gradient_bprop.mindir +0 -12
- mindspore/ops/composite/array_ops.py +0 -210
- mindspore/ops/composite/clip_ops.py +0 -238
- mindspore/ops/composite/random_ops.py +0 -426
- mindspore/ops/composite/vmap_ops.py +0 -38
- mindspore/ops/operations/sponge_ops.py +0 -3531
- mindspore/ops/operations/sponge_update_ops.py +0 -2546
- mindspore/parallel/nn/__init__.py +0 -42
- mindspore/parallel/nn/loss.py +0 -22
- mindspore/parallel/nn/moe.py +0 -21
- mindspore/parallel/nn/op_parallel_config.py +0 -22
- mindspore/parallel/nn/transformer.py +0 -31
- mindspore/run_check/_check_deps_version.py +0 -84
- {mindspore-1.10.0.dist-info → mindspore-2.0.0rc1.dist-info}/WHEEL +0 -0
- {mindspore-1.10.0.dist-info → mindspore-2.0.0rc1.dist-info}/entry_points.txt +0 -0
- {mindspore-1.10.0.dist-info → mindspore-2.0.0rc1.dist-info}/top_level.txt +0 -0
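The hunks reproduced below correspond to `mindspore/profiler/parser/integrator.py` (+18 −1579 in the list above). Two things change: the AI Core statistics now carry an execution frequency next to each operator's compute time, with per-type percentages computed against a frequency-weighted total, and the timeline-generator code is removed from this module, consistent with the new `base_timeline_generator.py`, `ascend_timeline_generator.py`, and `cpu_gpu_timeline_generator.py` files listed above. A minimal sketch of the new weighted aggregation, with invented operator names and times (not MindSpore's API), is:

```python
# Hedged sketch of the frequency-weighted aggregation introduced below;
# operator names and numbers are invented for illustration only.
from decimal import Decimal

# full_op_name -> [average execution time, execution frequency]
op_time_cache = {
    "Conv2D-op1": [Decimal("0.5"), 4],
    "ReLU-op2": [Decimal("0.1"), 4],
}
op_name_type = {"Conv2D-op1": "Conv2D", "ReLU-op2": "ReLU"}

total_time = sum(time * freq for time, freq in op_time_cache.values())

op_type_time = {}
for full_op_name, (time, freq) in op_time_cache.items():
    acc = op_type_time.setdefault(op_name_type[full_op_name], [Decimal("0"), 0])
    acc[0] += time * freq   # accumulated time for this op type
    acc[1] += freq          # accumulated execution count

for op_type, (type_time, count) in op_type_time.items():
    if total_time != 0:
        # row layout matches the new _header_aicore_type:
        # ['op_type', 'total_time', 'execution_frequency', 'percent']
        print([op_type, type_time, count, round(type_time / total_time * 100, 2)])
```

Each op-type row ends up as `[op_type, total_time, execution_frequency, percent]`, matching the new `_header_aicore_type` in the diff below.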
@@ -16,22 +16,13 @@
 import csv
 import json
 import os
-import stat
 from decimal import Decimal
 from enum import Enum
 import sys
-
 from mindspore import log as logger
-from mindspore import context
-from mindspore.context import get_auto_parallel_context
-from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \
-    ProfilerFileNotFoundException, ProfilerRawFileException, ProfilerParamValueErrorException
+from mindspore.profiler.common.exceptions.exceptions import ProfilerRawFileException
 from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond
 from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
-from mindspore.profiler.parser.container import TimelineContainer
-from mindspore.profiler.parser.op_intermediate_parser import OPIntermediateParser
-
-SIZE_LIMIT_DEFAULT = 20 * 1024 * 1024  # 20MB


 class Integrator:
@@ -46,9 +37,8 @@ class Integrator:
     _file_name_aicore_detail_time = 'output_op_compute_time_{}.txt'
     _file_name_aicpu_time = 'output_data_preprocess_aicpu_{}.txt'
     _file_name_framework = 'framework_raw_{}.csv'
-    _header_aicore_type = ['op_type', 'execution_time', 'execution_frequency',
-                           'percent']
-    _header_aicore_detail = ['full_op_name', 'execution_time']
+    _header_aicore_type = ['op_type', 'total_time', 'execution_frequency', 'percent']
+    _header_aicore_detail = ['full_op_name', 'execution_time', 'execution_frequency']
     _header_aicpu = ['serial_number', 'op_type', 'total_time', 'dispatch_time',
                      'execution_time', 'run_start', 'run_end']

@@ -62,13 +52,13 @@ class Integrator:
     _aicore_trace_data = []

     def __init__(self, profiling_dir, device_id):
+        csv.field_size_limit(sys.maxsize)
         self._profiling_dir = profiling_dir
         self._device_id = device_id
         self._op_time_cache = {}
         self._total_time = Decimal('0.0')
         self._column = ""
         self._result = []
-        csv.field_size_limit(sys.maxsize)

     @staticmethod
     def _is_match_condition(exp_key, exp_value, actual_value):
@@ -150,16 +140,17 @@ class Integrator:
             op_name_type_cache[row[3]] = row[5]

         op_type_time_cache = {}
-        for full_op_name, op_time in self._op_time_cache.items():
+        total_time = 0
+        for full_op_name, op_info in self._op_time_cache.items():
+            total_time += op_info[0] * op_info[1]
             op_type = op_name_type_cache.get(full_op_name)
             op_type_time = op_type_time_cache.get(op_type)
             if not op_type_time:
-                op_type_time = [op_time, 1]
+                op_type_time = [op_info[0] * op_info[1], op_info[1]]
                 op_type_time_cache[op_type] = op_type_time
             else:
-                op_type_time[0] += op_time
-                op_type_time[1] += 1
-
+                op_type_time[0] += op_info[0] * op_info[1]
+                op_type_time[1] += op_info[1]
         op_type_file_name = 'aicore_intermediate_' + self._device_id + '_type.csv'
         op_type_file_path = os.path.join(self._profiling_dir, op_type_file_name)
         with open(op_type_file_path, 'w') as type_file:
@@ -167,10 +158,11 @@ class Integrator:
             csv_writer.writerow(self._header_aicore_type)

             for op_type, op_type_time_info in op_type_time_cache.items():
-                type_info = [
-                    op_type, op_type_time_info[0], op_type_time_info[1],
-                    round((op_type_time_info[0] / self._total_time) * 100, 2)
-                ]
+                if total_time != 0:
+                    type_info = [
+                        op_type, op_type_time_info[0], op_type_time_info[1],
+                        round((op_type_time_info[0] / total_time) * 100, 2)
+                    ]
                 csv_writer.writerow(type_info)

     def _parse_aicore_detail_time(self):
@@ -210,8 +202,8 @@ class Integrator:
             if op_infos[0] == 'total':
                 self._total_time = Decimal(op_infos[2])
                 continue
-            self._op_time_cache[op_infos[0]] = Decimal(op_infos[1])
-            csv_writer.writerow([op_infos[0], op_infos[1]])
+            self._op_time_cache[op_infos[0]] = [Decimal(op_infos[1]), int(op_infos[3])]
+            csv_writer.writerow([op_infos[0], op_infos[1], op_infos[3]])

     def _parse_aicpu_time(self):
         """Parse the parsed AICPU operator time file."""
@@ -303,7 +295,7 @@ class Integrator:
         """Load data according to the parsed AICORE operator types file."""
         file_path = query_latest_trace_time_file(self._profiling_dir, int(self._device_id))
         if not file_path:
-            logger.warning("Failed to find parsed trace time file.
+            logger.warning("Failed to find parsed trace time file.")
             return
         file_path = validate_and_normalize_path(file_path)
         with open(file_path, 'r') as handle:
@@ -519,1556 +511,3 @@ class DeviceTarget(Enum):
     CPU = 'cpu'
     GPU = 'gpu'
     ASCEND = 'ascend'
-
-
-class BaseTimelineGenerator:
-    """
-    Analyse timeline data from file.
-    """
-    # AI Core Op pid is device_id
-    _AI_CPU_PID = 9000
-    _COMMUNICATION_OP_PID = 10000
-    _HOST_CPU_PID = 11000
-    _OP_OVERLAP_PID = 12000
-
-    _OP_GPU_ACTIVITY_PID = 13000
-
-    _RECEIVE_ALONE = 7997
-    _ALLREDUCE_ALONE = 7998
-    _MERGED_COMPUTATION_TID = 7999
-    _PURE_COMMUNICATION_TID = 8000
-    _MERGED_COMMUNICATION_TID = 8001
-    _FREE_TIME_TID = 8002
-    _STEPS_TID = 100000
-    _SCOPE_NAME_TID = 100001
-    _GPU_OP_TID = 100002
-    _HOST_CPU_OP_TID = 100003
-    _SINGLE_TID = 0
-
-    _STEPS_SORT_INDEX = -4
-
-    _output_timeline_data_file_path = 'output_timeline_data_{}.txt'
-    _timeline_meta = []
-    _format_meta_data_list = []
-    _thread_processed_list = []
-
-    _map_tid_name_to_int = {
-        "Steps": (-4, _STEPS_TID),
-        "Scope Name": (-3, _SCOPE_NAME_TID),
-        "GpuOps": (-2, _GPU_OP_TID),
-        "HostCpuOps": (-1, _HOST_CPU_OP_TID)
-    }
-    _timeline_summary = {
-        'total_time': 0,
-        'num_of_streams': 0,
-        'num_of_ops': 0,
-        'op_exe_times': 0,
-        'max_scope_name_num': 0,
-    }
-    _op_name_idx, _tid_idx, _start_time_idx, _duration_idx = 0, 1, 2, 3
-    _max_scope_name_num = 0
-    _host_cpu_op_label = 'Host CPU OP'
-    _gpu_op_label = "GPU Op"
-    _ascend_op_label = "Ascend Op"
-    _aicore_op_label = "AICORE OP"
-    _aicpu_op_label = "AICPU OP"
-
-    _device_id = 0
-    _profiling_dir = ""
-    _timeline_summary_filename = ""
-    _display_filename = ""
-    _op_name_list = []
-    _device_target = DeviceTarget.ASCEND.value
-    _model = context.GRAPH_MODE
-
-    __col_names = ['op_name', 'stream_id', 'start_time', 'duration']
-
-    def __init__(self, device_target, model):
-        self._tid_dict = {
-            "computation_op": (self._MERGED_COMPUTATION_TID, self._OP_OVERLAP_PID),
-            "communication_not_overlapped": (self._PURE_COMMUNICATION_TID, self._OP_OVERLAP_PID),
-            "communication": (self._MERGED_COMMUNICATION_TID, self._OP_OVERLAP_PID),
-            "free_time": (self._FREE_TIME_TID, self._OP_OVERLAP_PID)
-        }
-        self._device_target = str(device_target).lower()
-        self._model = model
-        self._step_start_op_name = ""
-        self._step_end_op_name = ""
-
-    @staticmethod
-    def get_parallel_context():
-        """Get parallel context."""
-        try:
-            parallel_mode = get_auto_parallel_context("parallel_mode")
-            stage_num = get_auto_parallel_context("pipeline_stages")
-        except RuntimeError:
-            logger.warning("[profiler] the feature of cluster bottleneck analyse "
-                           "is not supported in offline parse mode.")
-            parallel_mode = "data_parallel"
-            stage_num = 1
-        if stage_num > 1:
-            parallel_mode = "pipeline-parallel"
-        elif parallel_mode != "data_parallel":
-            parallel_mode = "model-parallel"
-        else:
-            parallel_mode = "data-parallel"
-        return parallel_mode, stage_num
-
-    @staticmethod
-    def _update_num_of_streams(timeline, stream_count_dict):
-        """Update number of streams."""
-        stream_id = timeline[1]
-        if stream_id in ["Steps", "Scope Name"]:
-            return
-        if stream_id not in stream_count_dict.keys():
|
|
624
|
-
stream_count_dict[stream_id] = 1
|
|
625
|
-
else:
|
|
626
|
-
stream_count_dict[stream_id] += 1
|
|
627
|
-
|
|
628
|
-
def get_thread_label_name(self):
|
|
629
|
-
"""Get process and thread config."""
|
|
630
|
-
device_process_label = self._get_device_process_label()
|
|
631
|
-
return [
|
|
632
|
-
{"name": "process_labels", "ph": "M", "pid": self._device_id, "args": {"labels": device_process_label}},
|
|
633
|
-
{"name": "process_labels", "ph": "M", "pid": self._AI_CPU_PID, "args": {"labels": self._aicpu_op_label}},
|
|
634
|
-
{"name": "process_labels", "ph": "M", "pid": self._COMMUNICATION_OP_PID,
|
|
635
|
-
"args": {"labels": "Communication Op"}},
|
|
636
|
-
{"name": "process_labels", "ph": "M", "pid": self._HOST_CPU_PID,
|
|
637
|
-
"args": {"labels": self._host_cpu_op_label}},
|
|
638
|
-
{"name": "process_labels", "ph": "M", "pid": self._OP_OVERLAP_PID,
|
|
639
|
-
"args": {"labels": "Op Overlap Analyse"}},
|
|
640
|
-
{"name": "process_labels", "ph": "M", "pid": self._OP_GPU_ACTIVITY_PID,
|
|
641
|
-
"args": {"labels": "Activity Op"}},
|
|
642
|
-
|
|
643
|
-
{"name": "process_sort_index", "ph": "M", "pid": self._device_id, "args": {"sort_index": 0}},
|
|
644
|
-
{"name": "process_sort_index", "ph": "M", "pid": self._AI_CPU_PID, "args": {"sort_index": 10}},
|
|
645
|
-
{"name": "process_sort_index", "ph": "M", "pid": self._COMMUNICATION_OP_PID, "args": {"sort_index": 20}},
|
|
646
|
-
{"name": "process_sort_index", "ph": "M", "pid": self._HOST_CPU_PID, "args": {"sort_index": 30}},
|
|
647
|
-
{"name": "process_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "args": {"sort_index": 40}},
|
|
648
|
-
|
|
649
|
-
{"name": "thread_name", "ph": "M", "pid": self._HOST_CPU_PID, "tid": self._HOST_CPU_OP_TID,
|
|
650
|
-
"args": {"name": "Host CPU Op"}},
|
|
651
|
-
{"name": "thread_name", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._MERGED_COMPUTATION_TID,
|
|
652
|
-
"args": {"name": "Merged Computation Op"}},
|
|
653
|
-
{"name": "thread_name", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._PURE_COMMUNICATION_TID,
|
|
654
|
-
"args": {"name": "Pure Communication Op"}},
|
|
655
|
-
{"name": "thread_name", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._MERGED_COMMUNICATION_TID,
|
|
656
|
-
"args": {"name": "Merged Communication Op"}},
|
|
657
|
-
{"name": "thread_name", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._FREE_TIME_TID,
|
|
658
|
-
"args": {"name": "Free Time"}},
|
|
659
|
-
{"name": "thread_name", "ph": "M", "pid": self._device_id, "tid": self._STEPS_TID,
|
|
660
|
-
"args": {"name": "Steps"}},
|
|
661
|
-
{"name": "thread_name", "ph": "M", "pid": self._device_id, "tid": self._SINGLE_TID,
|
|
662
|
-
"args": {"name": "Ops"}},
|
|
663
|
-
|
|
664
|
-
{"name": "thread_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._MERGED_COMPUTATION_TID,
|
|
665
|
-
"args": {"sort_index": self._MERGED_COMPUTATION_TID}},
|
|
666
|
-
{"name": "thread_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._PURE_COMMUNICATION_TID,
|
|
667
|
-
"args": {"sort_index": self._PURE_COMMUNICATION_TID}},
|
|
668
|
-
{"name": "thread_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._MERGED_COMMUNICATION_TID,
|
|
669
|
-
"args": {"sort_index": self._MERGED_COMMUNICATION_TID}},
|
|
670
|
-
{"name": "thread_sort_index", "ph": "M", "pid": self._OP_OVERLAP_PID, "tid": self._FREE_TIME_TID,
|
|
671
|
-
"args": {"sort_index": self._FREE_TIME_TID}},
|
|
672
|
-
{"name": "thread_sort_index", "ph": "M", "pid": self._device_id, "tid": self._STEPS_TID,
|
|
673
|
-
"args": {"sort_index": self._STEPS_SORT_INDEX}},
|
|
674
|
-
]
|
|
675
|
-
|
|
676
|
-
def write_timeline(self, size_limit=SIZE_LIMIT_DEFAULT):
|
|
677
|
-
"""Load data according to the parsed profiling files."""
|
|
678
|
-
# Write timeline to file.
|
|
679
|
-
logger.info('Writing timeline file...')
|
|
680
|
-
timeline_meta = self.write_timeline_to_json_by_limitation(size_limit)
|
|
681
|
-
logger.info('Finished file writing!')
|
|
682
|
-
return timeline_meta
|
|
683
|
-
|
|
684
|
-
def write_timeline_to_json_by_limitation(self, size_limit):
|
|
685
|
-
"""Write timeline to json by limitation."""
|
|
686
|
-
display_file_path = os.path.join(
|
|
687
|
-
self._profiling_dir,
|
|
688
|
-
self._display_filename
|
|
689
|
-
)
|
|
690
|
-
display_file_path = validate_and_normalize_path(display_file_path)
|
|
691
|
-
|
|
692
|
-
try:
|
|
693
|
-
with open(display_file_path, 'w') as json_file:
|
|
694
|
-
json_file.write('[')
|
|
695
|
-
for _, item in enumerate(self._timeline_meta):
|
|
696
|
-
json.dump(item, json_file)
|
|
697
|
-
if "scope_level" in item.keys():
|
|
698
|
-
self._max_scope_name_num = max(
|
|
699
|
-
self._max_scope_name_num, item["scope_level"] + 1)
|
|
700
|
-
file_size = os.path.getsize(display_file_path)
|
|
701
|
-
json_file.write(',')
|
|
702
|
-
if file_size > size_limit:
|
|
703
|
-
break
|
|
704
|
-
label_name_json = json.dumps(self.get_thread_label_name())
|
|
705
|
-
label_name_json = label_name_json.lstrip('[')
|
|
706
|
-
json_file.write(label_name_json)
|
|
707
|
-
os.chmod(display_file_path, stat.S_IREAD | stat.S_IWRITE)
|
|
708
|
-
return self._timeline_meta
|
|
709
|
-
except (IOError, OSError) as err:
|
|
710
|
-
logger.critical('Error occurred when write timeline display file: %s', err)
|
|
711
|
-
raise ProfilerIOException() from err
|
|
712
|
-
|
|
713
|
-
def write_timeline_summary(self):
|
|
714
|
-
"""Write timeline summary to json."""
|
|
715
|
-
timeline_summary_file_path = os.path.join(
|
|
716
|
-
self._profiling_dir,
|
|
717
|
-
self._timeline_summary_filename
|
|
718
|
-
)
|
|
719
|
-
|
|
720
|
-
timeline_summary_file_path = validate_and_normalize_path(timeline_summary_file_path)
|
|
721
|
-
|
|
722
|
-
try:
|
|
723
|
-
with open(timeline_summary_file_path, 'w') as json_file:
|
|
724
|
-
json.dump(self._timeline_summary, json_file)
|
|
725
|
-
os.chmod(timeline_summary_file_path, stat.S_IREAD | stat.S_IWRITE)
|
|
726
|
-
except (IOError, OSError) as err:
|
|
727
|
-
logger.critical('Error occurred when write timeline summary file: %s', err)
|
|
728
|
-
raise ProfilerIOException() from err
|
|
729
|
-
|
|
730
|
-
def _get_device_process_label(self):
|
|
731
|
-
"""Get device process label."""
|
|
732
|
-
device_process_label = self._aicore_op_label
|
|
733
|
-
if self._device_target == DeviceTarget.ASCEND.value:
|
|
734
|
-
if self._model == context.GRAPH_MODE:
|
|
735
|
-
device_process_label = self._aicore_op_label
|
|
736
|
-
elif self._model == context.PYNATIVE_MODE:
|
|
737
|
-
device_process_label = self._ascend_op_label
|
|
738
|
-
elif self._device_target == DeviceTarget.GPU.value:
|
|
739
|
-
device_process_label = self._gpu_op_label
|
|
740
|
-
elif self._device_target == DeviceTarget.CPU.value:
|
|
741
|
-
device_process_label = self._host_cpu_op_label
|
|
742
|
-
return device_process_label
|
|
743
|
-
|
|
744
|
-
def _get_merged_time_list(self, time_list, get_interval_time=False, display_name="computation_op", factor=1):
|
|
745
|
-
"""
|
|
746
|
-
Get merged time segment list.
|
|
747
|
-
|
|
748
|
-
The process of merge is, for example, there is a list [[1,5], [2,6], [7,8]],
|
|
749
|
-
each items in this list contains a start_time and end_time,
|
|
750
|
-
the merged result is [[1,6], [7,8]].
|
|
751
|
-
"""
|
|
752
|
-
time_merged_segment_list = []
|
|
753
|
-
tid = self._tid_dict.get(display_name, (0, 0))[0]
|
|
754
|
-
pid = self._tid_dict.get(display_name, (0, 0))[1]
|
|
755
|
-
for time_item in time_list:
|
|
756
|
-
time_segment = list(map(float, time_item[self._start_time_idx:self._duration_idx + 1]))
|
|
757
|
-
time_segment[1] = time_segment[0] + time_segment[1] / factor
|
|
758
|
-
if not time_merged_segment_list or \
|
|
759
|
-
time_segment[0] > time_merged_segment_list[-1]:
|
|
760
|
-
time_merged_segment_list.extend(time_segment)
|
|
761
|
-
else:
|
|
762
|
-
time_merged_segment_list[-1] = max(
|
|
763
|
-
time_merged_segment_list[-1],
|
|
764
|
-
time_segment[1]
|
|
765
|
-
)
|
|
766
|
-
|
|
767
|
-
# merged_display_list data used for ui page.
|
|
768
|
-
merged_display_list = [
|
|
769
|
-
[display_name, tid, time_merged_segment_list[i * 2],
|
|
770
|
-
(time_merged_segment_list[i * 2 + 1] - time_merged_segment_list[i * 2]) * factor, pid] for i in \
|
|
771
|
-
range(len(time_merged_segment_list) // 2)
|
|
772
|
-
]
|
|
773
|
-
|
|
774
|
-
if get_interval_time:
|
|
775
|
-
time_merged_segment_list = time_merged_segment_list[1:-1]
|
|
776
|
-
|
|
777
|
-
# merged_res_list data used to compute overlap with other time_list.
|
|
778
|
-
merged_res_list = []
|
|
779
|
-
for i in range(len(time_merged_segment_list) // 2):
|
|
780
|
-
merged_res_list.append([display_name, tid, time_merged_segment_list[i * 2],
|
|
781
|
-
time_merged_segment_list[i * 2 + 1], pid])
|
|
782
|
-
|
|
783
|
-
# interval_display_list is interval time used for ui page.
|
|
784
|
-
interval_display_list = [
|
|
785
|
-
[display_name, tid, time_merged_segment_list[i * 2],
|
|
786
|
-
(time_merged_segment_list[i * 2 + 1] - time_merged_segment_list[i * 2]) * factor, pid]
|
|
787
|
-
for i in range(len(time_merged_segment_list) // 2)
|
|
788
|
-
]
|
|
789
|
-
|
|
790
|
-
return merged_res_list, interval_display_list, merged_display_list
|
|
791
|
-
|
|
792
|
-
def _update_format_meta_data(self, timeline_dict):
|
|
793
|
-
"""Update format meta data which control the display arrange and map the thread name."""
|
|
794
|
-
thread_name_meta_data = {
|
|
795
|
-
"name": "thread_name",
|
|
796
|
-
"pid": int(self._device_id),
|
|
797
|
-
"tid": 100000,
|
|
798
|
-
"ts": 0,
|
|
799
|
-
"ph": "M",
|
|
800
|
-
"cat": "__metadata",
|
|
801
|
-
"args": {
|
|
802
|
-
"name": "Steps"
|
|
803
|
-
}
|
|
804
|
-
}
|
|
805
|
-
tid_name = timeline_dict['tid']
|
|
806
|
-
sort_index = 0
|
|
807
|
-
|
|
808
|
-
if tid_name in self._map_tid_name_to_int.keys():
|
|
809
|
-
sort_index, tid = self._map_tid_name_to_int[tid_name]
|
|
810
|
-
elif tid_name.startswith("Stream"):
|
|
811
|
-
tid = int(tid_name.split("#")[-1])
|
|
812
|
-
sort_index = tid
|
|
813
|
-
else:
|
|
814
|
-
return
|
|
815
|
-
|
|
816
|
-
if self._host_cpu_op_label == tid_name[:len(self._host_cpu_op_label)]:
|
|
817
|
-
thread_name_meta_data['pid'] = self._HOST_CPU_PID
|
|
818
|
-
|
|
819
|
-
thread_name_meta_data["tid"] = tid
|
|
820
|
-
thread_name_meta_data["args"]["name"] = tid_name
|
|
821
|
-
thread_sort_meta_data = thread_name_meta_data.copy()
|
|
822
|
-
thread_sort_meta_data['name'] = "thread_sort_index"
|
|
823
|
-
thread_sort_meta_data["args"] = {"sort_index": sort_index}
|
|
824
|
-
timeline_dict["tid"] = tid
|
|
825
|
-
|
|
826
|
-
if tid_name in self._thread_processed_list:
|
|
827
|
-
return
|
|
828
|
-
|
|
829
|
-
self._thread_processed_list.append(tid_name)
|
|
830
|
-
self._format_meta_data_list.append(thread_name_meta_data)
|
|
831
|
-
self._format_meta_data_list.append(thread_sort_meta_data)
|
|
832
|
-
|
|
833
|
-
def _get_max_scope_name_num(self, timeline_list):
|
|
834
|
-
"""Get the max number of scope level from all operator."""
|
|
835
|
-
max_scope_name_num = 0
|
|
836
|
-
for time_item in timeline_list:
|
|
837
|
-
cur_scope_name_num = len(time_item[self._op_name_idx].split('/')) - 1
|
|
838
|
-
max_scope_name_num = max(cur_scope_name_num, max_scope_name_num)
|
|
839
|
-
|
|
840
|
-
return max_scope_name_num
|
|
841
|
-
|
|
842
|
-
def _get_scope_name_time_list(self, timeline_list, subgraph, factor_start_time_to_duration=1):
|
|
843
|
-
"""Produce the timeline of hierarchical scope name."""
|
|
844
|
-
# the key of scope_name_start_duration_dict is scope name, the value is a dict which store the
|
|
845
|
-
# start and end index of time_item in timeline_list.
|
|
846
|
-
scope_name_start_duration_dict = {}
|
|
847
|
-
scope_name_time_list = []
|
|
848
|
-
op_full_name_idx, scope_name_idx, invalid_idx = 0, 0, -1
|
|
849
|
-
tid = "Scope Name"
|
|
850
|
-
for idx, time_item in enumerate(timeline_list):
|
|
851
|
-
scope_name_list = time_item[op_full_name_idx].split('/')[:-1]
|
|
852
|
-
# skip Default/InitDataSetQueue operator.
|
|
853
|
-
if time_item[op_full_name_idx].startswith("Default/InitDataSetQueue"):
|
|
854
|
-
scope_name_list = []
|
|
855
|
-
# process scope name of subgraph(Default/Gradients/recompute_Default) only.
|
|
856
|
-
if scope_name_list and scope_name_list[0] != subgraph:
|
|
857
|
-
scope_name_list = []
|
|
858
|
-
# add the level of scope name, used to distinguish the same name at different scope level.
|
|
859
|
-
scope_name_list = [f"{scope_level}-{scope_name}"
|
|
860
|
-
for scope_level, scope_name in enumerate(scope_name_list)]
|
|
861
|
-
|
|
862
|
-
# update the start and end index of time_item according to current scope_name
|
|
863
|
-
for scope_name in scope_name_list:
|
|
864
|
-
init_start_end_idx_dict = {'start_item_idx': idx, 'end_item_idx': idx}
|
|
865
|
-
if scope_name not in scope_name_start_duration_dict:
|
|
866
|
-
scope_name_start_duration_dict[scope_name] = init_start_end_idx_dict
|
|
867
|
-
if scope_name_start_duration_dict[scope_name]['start_item_idx'] == invalid_idx:
|
|
868
|
-
scope_name_start_duration_dict[scope_name] = init_start_end_idx_dict
|
|
869
|
-
else:
|
|
870
|
-
scope_name_start_duration_dict[scope_name]['end_item_idx'] = idx
|
|
871
|
-
# if the key(scope name) in scope_name_start_duration_dict does not appear in scope_name_list,
|
|
872
|
-
# it means this key(scope name) is end and it is append to scope_name_time_list.
|
|
873
|
-
for key, val in scope_name_start_duration_dict.items():
|
|
874
|
-
if val['start_item_idx'] == invalid_idx:
|
|
875
|
-
continue
|
|
876
|
-
if (key not in scope_name_list) \
|
|
877
|
-
or idx == (len(timeline_list) - 1) \
|
|
878
|
-
or time_item[op_full_name_idx] == self._step_end_op_name:
|
|
879
|
-
start_time = timeline_list[val['start_item_idx']][self._start_time_idx]
|
|
880
|
-
duration = (float(timeline_list[val['end_item_idx']][self._start_time_idx]) - float(start_time)) * \
|
|
881
|
-
factor_start_time_to_duration + \
|
|
882
|
-
float(timeline_list[val['end_item_idx']][self._duration_idx])
|
|
883
|
-
scope_name_time_item = [key, tid, start_time, duration]
|
|
884
|
-
scope_name_time_list.append(scope_name_time_item)
|
|
885
|
-
scope_name_start_duration_dict[key]['start_item_idx'] = invalid_idx
|
|
886
|
-
|
|
887
|
-
# x[scope_name_idx] is a scope name like "0-Default".
|
|
888
|
-
# if two element in scope_name_time_list have the same start time,
|
|
889
|
-
# the previous element in list will displayed at the higher line in UI page.
|
|
890
|
-
scope_name_time_list.sort(
|
|
891
|
-
key=lambda x: (float(x[self._start_time_idx]), int(x[scope_name_idx].split('-')[0]))
|
|
892
|
-
)
|
|
893
|
-
|
|
894
|
-
return scope_name_time_list
|
|
895
|
-
|
|
896
|
-
def _set_step_start_and_end_op_name(self, timeline_list):
|
|
897
|
-
"""Set the start and end operator full name of each step."""
|
|
898
|
-
if not timeline_list:
|
|
899
|
-
return
|
|
900
|
-
start_op_idx = 0
|
|
901
|
-
if timeline_list[0][self._op_name_idx].startswith("Default/InitDataSetQueue"):
|
|
902
|
-
start_op_idx = 1
|
|
903
|
-
self._step_start_op_name = timeline_list[start_op_idx][self._op_name_idx]
|
|
904
|
-
self._step_end_op_name = self._step_start_op_name
|
|
905
|
-
if len(timeline_list) > (start_op_idx + 1):
|
|
906
|
-
for time_item in timeline_list[start_op_idx + 1:]:
|
|
907
|
-
if time_item[self._op_name_idx] != self._step_start_op_name:
|
|
908
|
-
self._step_end_op_name = time_item[self._op_name_idx]
|
|
909
|
-
else:
|
|
910
|
-
break
|
|
911
|
-
|
|
912
|
-
def _get_step_time_list(self, timeline_list, factor_start_time_to_duration=1):
|
|
913
|
-
"""Produce the time of each step."""
|
|
914
|
-
# Record the time of each step.
|
|
915
|
-
step_time_list = []
|
|
916
|
-
step_num = 1
|
|
917
|
-
tid = "Steps"
|
|
918
|
-
cur_step_start_time, cur_step_duration_time = 0, 0
|
|
919
|
-
for time_item in timeline_list:
|
|
920
|
-
if time_item[self._op_name_idx] == self._step_start_op_name:
|
|
921
|
-
cur_step_start_time = time_item[self._start_time_idx]
|
|
922
|
-
if time_item[self._op_name_idx] == self._step_end_op_name:
|
|
923
|
-
cur_step_duration_time = (float(time_item[self._start_time_idx]) - float(cur_step_start_time)) * \
|
|
924
|
-
float(factor_start_time_to_duration) + float(time_item[self._duration_idx])
|
|
925
|
-
step_time_item = [str(step_num), tid, float(cur_step_start_time), cur_step_duration_time]
|
|
926
|
-
step_time_list.append(step_time_item)
|
|
927
|
-
step_num += 1
|
|
928
|
-
|
|
929
|
-
return step_time_list
|
|
930
|
-
|
|
931
|
-
def _write_cluster_metrices(self, metrices, is_pipeline_parallel, device_target, dev_id):
|
|
932
|
-
"""Write cluster metric."""
|
|
933
|
-
# Note that the feature of cluster bottleneck analyse is not supported in offline parse mode,
|
|
934
|
-
# due to that parallel context is not set.
|
|
935
|
-
parallel_mode, stage_num = BaseTimelineGenerator.get_parallel_context()
|
|
936
|
-
|
|
937
|
-
unit = 1 if device_target == "Ascend" else 1e3
|
|
938
|
-
time_decimal_digits = 4
|
|
939
|
-
cluster_analyse_file_path = os.path.join(
|
|
940
|
-
self._profiling_dir,
|
|
941
|
-
self._cluster_analyse_filename.format(parallel_mode, stage_num, self._rank_size, dev_id)
|
|
942
|
-
)
|
|
943
|
-
cluster_analyse_file_path = validate_and_normalize_path(cluster_analyse_file_path)
|
|
944
|
-
|
|
945
|
-
try:
|
|
946
|
-
with open(cluster_analyse_file_path, 'w') as file_handle:
|
|
947
|
-
csv_writer = csv.writer(file_handle)
|
|
948
|
-
if is_pipeline_parallel:
|
|
949
|
-
header = ['computation_time', 'communication_alone_time', 'stage_time',
|
|
950
|
-
'receive_alone_time', 'collective_communication_alone_time']
|
|
951
|
-
zip_metrices = zip(metrices[0], metrices[1], metrices[2], metrices[3], metrices[4])
|
|
952
|
-
else:
|
|
953
|
-
header = ['computation_time', 'communication_alone_time']
|
|
954
|
-
zip_metrices = zip(metrices[0], metrices[1])
|
|
955
|
-
csv_writer.writerow(header)
|
|
956
|
-
for row_data in zip_metrices:
|
|
957
|
-
row_data = [round(val / unit, time_decimal_digits) for val in row_data]
|
|
958
|
-
csv_writer.writerow(row_data)
|
|
959
|
-
os.chmod(cluster_analyse_file_path, stat.S_IREAD | stat.S_IWRITE)
|
|
960
|
-
except (IOError, OSError) as err:
|
|
961
|
-
logger.warning(f'Failed to save {cluster_analyse_file_path}. {err}')
|
|
962
|
-
raise ProfilerIOException from err
|
|
963
|
-
|
|
964
|
-
def _register_op_name(self, timeline_list):
|
|
965
|
-
"""Register op name to op name list."""
|
|
966
|
-
for timeline in timeline_list:
|
|
967
|
-
if timeline and timeline[self._op_name_idx] not in self._op_name_list:
|
|
968
|
-
self._op_name_list.append(timeline[self._op_name_idx])
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
class GpuTimelineGenerator(BaseTimelineGenerator):
|
|
972
|
-
"""Generate gpu Timeline data from file."""
|
|
973
|
-
_display_filename = 'gpu_timeline_display_{}.json'
|
|
974
|
-
_timeline_summary_filename = 'gpu_timeline_summary_{}.json'
|
|
975
|
-
_output_op_execute_time_file_path = "gpu_op_execute_timestamp_{}.txt"
|
|
976
|
-
_output_activity_execute_time_file_path = "activity_execute_timestamp_{}.txt"
|
|
977
|
-
_output_gpu_activity_info_file_path = "gpu_activity_data_{}.csv"
|
|
978
|
-
_step_trace_original_filename = 'step_trace_profiling_{}.txt'
|
|
979
|
-
_cluster_analyse_filename = 'gpu_cluster_analyse_{}_{}_{}_{}.csv'
|
|
980
|
-
_activity_keys_list = []
|
|
981
|
-
|
|
982
|
-
def __init__(self, profiling_dir, device_id, rank_size, model):
|
|
983
|
-
super().__init__(DeviceTarget.GPU.value, model)
|
|
984
|
-
self._device_id = device_id
|
|
985
|
-
self._rank_size = rank_size
|
|
986
|
-
self._profiling_dir = profiling_dir
|
|
987
|
-
self._device_id = device_id
|
|
988
|
-
self._timeline_meta = []
|
|
989
|
-
self._display_filename = self._display_filename.format(device_id)
|
|
990
|
-
self._timeline_summary_filename = self._timeline_summary_filename.format(device_id)
|
|
991
|
-
self._tid_dict = {
|
|
992
|
-
"receive_op_not_overlapped": (self._RECEIVE_ALONE, self._OP_OVERLAP_PID),
|
|
993
|
-
"exclude_receive_op": (self._ALLREDUCE_ALONE, self._OP_OVERLAP_PID),
|
|
994
|
-
"computation_op": (self._MERGED_COMPUTATION_TID, self._OP_OVERLAP_PID),
|
|
995
|
-
"communication_not_overlapped": (self._PURE_COMMUNICATION_TID, self._OP_OVERLAP_PID),
|
|
996
|
-
"communication": (self._MERGED_COMMUNICATION_TID, self._OP_OVERLAP_PID),
|
|
997
|
-
"free_time": (self._FREE_TIME_TID, self._OP_OVERLAP_PID)
|
|
998
|
-
}
|
|
999
|
-
|
|
1000
|
-
def init_timeline(self, reduce_op_type):
|
|
1001
|
-
"""Init timeline metadata, adding all collected info."""
|
|
1002
|
-
timeline_list = self._load_timeline_data(reduce_op_type)
|
|
1003
|
-
|
|
1004
|
-
# Init a dict for counting the num of streams.
|
|
1005
|
-
stream_count_dict = {}
|
|
1006
|
-
for timeline in timeline_list:
|
|
1007
|
-
self._parse_timeline_data(timeline, 0)
|
|
1008
|
-
# Updating the collection of streams.
|
|
1009
|
-
if len(timeline) == 4:
|
|
1010
|
-
self._update_num_of_streams(timeline, stream_count_dict)
|
|
1011
|
-
|
|
1012
|
-
# Add format thread meta data.
|
|
1013
|
-
self._format_meta_data_list.extend(self._timeline_meta)
|
|
1014
|
-
self._timeline_meta = self._format_meta_data_list
|
|
1015
|
-
|
|
1016
|
-
# Update timeline summary info
|
|
1017
|
-
self._timeline_summary['num_of_streams'] += len(stream_count_dict)
|
|
1018
|
-
|
|
1019
|
-
def check_op_name(self, op_name):
|
|
1020
|
-
"""
|
|
1021
|
-
Check whether the operator name exists.
|
|
1022
|
-
|
|
1023
|
-
Args:
|
|
1024
|
-
op_name (str): The operator name or operator name prefix.
|
|
1025
|
-
|
|
1026
|
-
Returns:
|
|
1027
|
-
bool, `True` if the operator name does exist, else `False`.
|
|
1028
|
-
"""
|
|
1029
|
-
if not op_name:
|
|
1030
|
-
raise ProfilerParamValueErrorException('The op_name should exist.')
|
|
1031
|
-
for op_time_info in self._timeline_meta:
|
|
1032
|
-
full_op_name = op_time_info['name']
|
|
1033
|
-
if full_op_name and full_op_name.startswith(op_name):
|
|
1034
|
-
return True
|
|
1035
|
-
return False
|
|
1036
|
-
|
|
1037
|
-
def is_gpu_kernel_async_launch(self):
|
|
1038
|
-
"""Recognize the solution that launch the gpu kernel async."""
|
|
1039
|
-
step_trace_profiling_path = self._get_and_validate_path(
|
|
1040
|
-
self._step_trace_original_filename
|
|
1041
|
-
)
|
|
1042
|
-
try:
|
|
1043
|
-
with open(step_trace_profiling_path, 'r') as f_obj:
|
|
1044
|
-
line = next(f_obj)
|
|
1045
|
-
first_string = line.strip().split()[0]
|
|
1046
|
-
# the data format of launch the gpu kernel async is "Default/op1,160123 op-name"
|
|
1047
|
-
# otherwise, the data format is "Default/op1 160123,12 "
|
|
1048
|
-
return bool(len(first_string.split(',')) == 2)
|
|
1049
|
-
except (IOError, OSError) as err:
|
|
1050
|
-
logger.critical(f'Error occurred when read {step_trace_profiling_path}: {err}')
|
|
1051
|
-
raise ProfilerIOException() from err
|
|
1052
|
-
except StopIteration:
|
|
1053
|
-
logger.warning('No step trace data exists.')
|
|
1054
|
-
return False
|
|
1055
|
-
|
|
1056
|
-
def _get_and_validate_path(self, file_name):
|
|
1057
|
-
"""Generate op or activity file path from file name, and validate this path."""
|
|
1058
|
-
file_path = os.path.join(
|
|
1059
|
-
self._profiling_dir,
|
|
1060
|
-
file_name.format(self._device_id)
|
|
1061
|
-
)
|
|
1062
|
-
file_path = validate_and_normalize_path(file_path)
|
|
1063
|
-
if not os.path.exists(file_path):
|
|
1064
|
-
logger.critical(f"Failed to find parsed timeline file {file_path}.")
|
|
1065
|
-
raise ProfilerFileNotFoundException('parsed timeline file')
|
|
1066
|
-
|
|
1067
|
-
return file_path
|
|
1068
|
-
|
|
1069
|
-
def _parse_timeline_data(self, timeline, min_cycle_counter):
|
|
1070
|
-
"""Parse timeline data."""
|
|
1071
|
-
# factor to convert the time unit of start_time(ts) from 1ns to 1us for timeline display
|
|
1072
|
-
factor = 1000
|
|
1073
|
-
op_meta = TimelineContainer(timeline)
|
|
1074
|
-
timeline_dict = {}
|
|
1075
|
-
timeline_dict['name'] = op_meta.op_name.split('/')[-1]
|
|
1076
|
-
timeline_dict['ph'] = 'X'
|
|
1077
|
-
timeline_dict['tid'] = op_meta.stream_id
|
|
1078
|
-
timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) / factor
|
|
1079
|
-
dur = op_meta.duration
|
|
1080
|
-
timeline_dict['dur'] = dur
|
|
1081
|
-
if op_meta.pid is None:
|
|
1082
|
-
timeline_dict['pid'] = int(self._device_id)
|
|
1083
|
-
else:
|
|
1084
|
-
timeline_dict['pid'] = op_meta.pid
|
|
1085
|
-
if op_meta.stream_id == "Scope Name":
|
|
1086
|
-
# remove the level of scope name which has a format like "0-conv2-Conv2d".
|
|
1087
|
-
timeline_dict['name'] = "-".join(op_meta.op_name.split('-')[1:])
|
|
1088
|
-
timeline_dict['scope_level'] = int(op_meta.op_name.split('-')[0])
|
|
1089
|
-
elif op_meta.stream_id[:len(self._host_cpu_op_label)] == self._host_cpu_op_label:
|
|
1090
|
-
timeline_dict['pid'] = self._HOST_CPU_PID
|
|
1091
|
-
|
|
1092
|
-
if len(timeline) > 4:
|
|
1093
|
-
# len(timeline) > 4 refers to activity data, else op data.
|
|
1094
|
-
# Add args for activity data
|
|
1095
|
-
args_dict = {}
|
|
1096
|
-
for ix, value in enumerate(timeline[4:]):
|
|
1097
|
-
args_dict[self._activity_keys_list[ix]] = value
|
|
1098
|
-
timeline_dict['args'] = args_dict
|
|
1099
|
-
timeline_dict['tid'] = f"Stream #{timeline_dict['tid']}"
|
|
1100
|
-
elif op_meta.stream_id not in ["Scope Name", "Steps"]:
|
|
1101
|
-
# Update total time of operator execution.
|
|
1102
|
-
self._timeline_summary['total_time'] += dur / factor
|
|
1103
|
-
self._timeline_summary['op_exe_times'] += 1
|
|
1104
|
-
|
|
1105
|
-
self._update_format_meta_data(timeline_dict)
|
|
1106
|
-
self._timeline_meta.append(timeline_dict)
|
|
1107
|
-
|
|
1108
|
-
def _load_timeline_data(self, reduce_op_type):
|
|
1109
|
-
"""Load timeline data from file."""
|
|
1110
|
-
op_file_path = self._get_and_validate_path(
|
|
1111
|
-
self._output_op_execute_time_file_path)
|
|
1112
|
-
activity_file_path = self._get_and_validate_path(
|
|
1113
|
-
self._output_activity_execute_time_file_path)
|
|
1114
|
-
activity_args_file_path = self._get_and_validate_path(
|
|
1115
|
-
self._output_gpu_activity_info_file_path)
|
|
1116
|
-
|
|
1117
|
-
timeline_list, communication_info = self._load_op_data(op_file_path, reduce_op_type)
|
|
1118
|
-
communication_info.sort(key=lambda x: float(x[2]))
|
|
1119
|
-
# Add host cpu op timeline.
|
|
1120
|
-
cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._model)
|
|
1121
|
-
cpu_timeline_list = cpu_timeline_generator.load_cpu_op_data()
|
|
1122
|
-
if cpu_timeline_list:
|
|
1123
|
-
self._clock_synchronize_to_gpu(cpu_timeline_list)
|
|
1124
|
-
timeline_list.extend(cpu_timeline_list)
|
|
1125
|
-
timeline_list.sort(key=lambda x: float(x[2]))
|
|
1126
|
-
self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
|
|
1127
|
-
self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
|
|
1128
|
-
|
|
1129
|
-
# Generate step time.
|
|
1130
|
-
factor_start_time_uint_to_duration = 1e-3
|
|
1131
|
-
self._set_step_start_and_end_op_name(timeline_list)
|
|
1132
|
-
# Fit gpu kernel async launch solution.
|
|
1133
|
-
if self.is_gpu_kernel_async_launch():
|
|
1134
|
-
step_time_list = self._get_step_time_list_from_step_trace()
|
|
1135
|
-
else:
|
|
1136
|
-
step_time_list = self._get_step_time_list(timeline_list, factor_start_time_uint_to_duration)
|
|
1137
|
-
|
|
1138
|
-
# Add Scope Name.
|
|
1139
|
-
default_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Default",
|
|
1140
|
-
factor_start_time_uint_to_duration)
|
|
1141
|
-
gradient_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Gradients",
|
|
1142
|
-
factor_start_time_uint_to_duration)
|
|
1143
|
-
recompute_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "recompute_Default",
|
|
1144
|
-
factor_start_time_uint_to_duration)
|
|
1145
|
-
|
|
1146
|
-
activity_timeline_list, cuda_compute_ops_timeline_list = self._load_activity_data( \
|
|
1147
|
-
activity_file_path, activity_args_file_path)
|
|
1148
|
-
|
|
1149
|
-
# Add AllReduce info to timeline temp list and sort by start time.
|
|
1150
|
-
if communication_info:
|
|
1151
|
-
logger.debug('Allreduce info found, Start adding info to timeline...')
|
|
1152
|
-
cluster_related_timeline = self._get_cluster_timeline(
|
|
1153
|
-
timeline_list, cuda_compute_ops_timeline_list, communication_info, step_time_list)
|
|
1154
|
-
timeline_list.extend(cluster_related_timeline)
|
|
1155
|
-
timeline_list.extend(communication_info)
|
|
1156
|
-
timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
|
|
1157
|
-
|
|
1158
|
-
timeline_list.extend(default_scope_name_time_list)
|
|
1159
|
-
timeline_list.extend(gradient_scope_name_time_list)
|
|
1160
|
-
timeline_list.extend(recompute_scope_name_time_list)
|
|
1161
|
-
timeline_list.extend(step_time_list)
|
|
1162
|
-
|
|
1163
|
-
timeline_list.sort(key=lambda x: (float(x[self._start_time_idx])))
|
|
1164
|
-
|
|
1165
|
-
# Add cuda activity timeline.
|
|
1166
|
-
timeline_list.extend(activity_timeline_list)
|
|
1167
|
-
timeline_list.sort(key=lambda x: float(x[2]))
|
|
1168
|
-
|
|
1169
|
-
return timeline_list
|
|
1170
|
-
|
|
1171
|
-
def _clock_synchronize_to_gpu(self, timeline_list):
|
|
1172
|
-
"""Synchronize the timestamp from device to host."""
|
|
1173
|
-
start_time_file_path = os.path.join(self._profiling_dir, f"start_time_{self._device_id}.txt")
|
|
1174
|
-
|
|
1175
|
-
try:
|
|
1176
|
-
with open(start_time_file_path) as f_obj:
|
|
1177
|
-
lines = f_obj.readlines()
|
|
1178
|
-
# lines[0] stores the host monotonic time of start training.
|
|
1179
|
-
host_monotonic_start_time = int(lines[0].strip().split(':')[-1])
|
|
1180
|
-
# lines[1] stores the gpu time of start training.
|
|
1181
|
-
gpu_start_time = int(lines[1].strip().split(':')[-1])
|
|
1182
|
-
except (IOError, OSError) as err:
|
|
1183
|
-
logger.critical(f'Error occurred when read {start_time_file_path}: {err}')
|
|
1184
|
-
raise ProfilerIOException() from err
|
|
1185
|
-
|
|
1186
|
-
time_diff = gpu_start_time - host_monotonic_start_time
|
|
1187
|
-
for idx, time_item in enumerate(timeline_list):
|
|
1188
|
-
timeline_list[idx][self._start_time_idx] = int(time_item[self._start_time_idx]) + time_diff
|
|
1189
|
-
|
|
1190
|
-
def _load_op_data(self, op_file_path, reduce_op_type):
|
|
1191
|
-
"""Load operator data from file"""
|
|
1192
|
-
op_timeline_list = []
|
|
1193
|
-
communication_info = []
|
|
1194
|
-
try:
|
|
1195
|
-
with open(op_file_path, 'r') as f_obj:
|
|
1196
|
-
for line in f_obj:
|
|
1197
|
-
self._timeline_summary['num_of_ops'] += 1
|
|
1198
|
-
op_list = line.strip('\n').strip().split(';')
|
|
1199
|
-
time_arr = op_list[-1]
|
|
1200
|
-
time_arr = time_arr.split(" ")
|
|
1201
|
-
for time in time_arr:
|
|
1202
|
-
time = time.split(",")
|
|
1203
|
-
line_list = op_list[:2] + time
|
|
1204
|
-
communication_op_name = line_list[0].strip().split('/')[-1]
|
|
1205
|
-
if communication_op_name not in reduce_op_type:
|
|
1206
|
-
op_timeline_list.append(line_list)
|
|
1207
|
-
else:
|
|
1208
|
-
communication_info.append(line_list)
|
|
1209
|
-
except (IOError, OSError) as err:
|
|
1210
|
-
logger.critical('Error occurred when load operator timeline data intermediate file: %s', err)
|
|
1211
|
-
raise ProfilerIOException() from err
|
|
1212
|
-
|
|
1213
|
-
return op_timeline_list, communication_info
|
|
1214
|
-
|
|
1215
|
-
def _load_activity_data(self, activity_file_path, activity_args_file_path):
|
|
1216
|
-
"""Load activity data from file"""
|
|
1217
|
-
activity_timeline_list = []
|
|
1218
|
-
cuda_compute_ops_timeline_list = []
|
|
1219
|
-
args_dict = {}
|
|
1220
|
-
try:
|
|
1221
|
-
with open(activity_args_file_path, 'r') as args_file:
|
|
1222
|
-
csv_reader = csv.reader(args_file)
|
|
1223
|
-
keys_list = next(csv_reader)
|
|
1224
|
-
# keys_list [name, type, op_full_name, stream_id, block_dim, grid_dim, ...]
|
|
1225
|
-
self._activity_keys_list = keys_list[1:3] + keys_list[4:6]
|
|
1226
|
-
for info in csv_reader:
|
|
1227
|
-
args_dict[info[0]] = info[1:3] + info[4:6]
|
|
1228
|
-
with open(activity_file_path, 'r') as f_obj:
|
|
1229
|
-
for line in f_obj:
|
|
1230
|
-
line_list = line.strip('\n').split(';')
|
|
1231
|
-
# concat activity args info.
|
|
1232
|
-
line_list += args_dict[line_list[0]]
|
|
1233
|
-
if not line_list[0].startswith('nccl'):
|
|
1234
|
-
cuda_compute_ops_timeline_list.append(line_list)
|
|
1235
|
-
activity_timeline_list.append(line_list)
|
|
1236
|
-
except (IOError, OSError) as err:
|
|
1237
|
-
logger.critical('Error occurred when load activity timeline data intermediate file: %s', err)
|
|
1238
|
-
raise ProfilerIOException() from err
|
|
1239
|
-
|
|
1240
|
-
return activity_timeline_list, cuda_compute_ops_timeline_list
|
|
1241
|
-
|
|
1242
|
-
def _get_step_time_list_from_step_trace(self):
|
|
1243
|
-
"""Produce the time of each step based on step_trace_profiling file."""
|
|
1244
|
-
# Record the time of each step.
|
|
1245
|
-
step_time_list = []
|
|
1246
|
-
step_start_op_name = []
|
|
1247
|
-
step_end_op_name = []
|
|
1248
|
-
step_num = 1
|
|
1249
|
-
tid = "Steps"
|
|
1250
|
-
step_trace_profiling_path = self._get_and_validate_path(
|
|
1251
|
-
self._step_trace_original_filename
|
|
1252
|
-
)
|
|
1253
|
-
|
|
1254
|
-
try:
|
|
1255
|
-
with open(step_trace_profiling_path, 'r') as f_obj:
|
|
1256
|
-
for line in f_obj:
|
|
1257
|
-
line = line.strip().split()
|
|
1258
|
-
step_start_op_name.append(line[0].split(',')[0])
|
|
1259
|
-
step_end_op_name.append(line[3].split(',')[0])
|
|
1260
|
-
cur_step_start_time = float(line[0].split(',')[1])
|
|
1261
|
-
cur_step_end_time = float(line[3].split(',')[1])
|
|
1262
|
-
# convert duration time unit from ns to us.
|
|
1263
|
-
cur_step_duration_time = (cur_step_end_time - cur_step_start_time) / 1e3
|
|
1264
|
-
step_time_item = [str(step_num), tid, cur_step_start_time, cur_step_duration_time]
|
|
1265
|
-
step_time_list.append(step_time_item)
|
|
1266
|
-
step_num += 1
|
|
1267
|
-
except (IOError, OSError) as err:
|
|
1268
|
-
logger.critical(f'Error occurred when read {step_trace_profiling_path}: {err}')
|
|
1269
|
-
raise ProfilerIOException() from err
|
|
1270
|
-
|
|
1271
|
-
return step_time_list
|
|
1272
|
-
|
|
1273
|
-
def _get_cluster_timeline(self, timeline, activity_info, comm_info, step_info):
|
|
1274
|
-
"""
|
|
1275
|
-
Analyse the cluster communication and computation data, and write result to file.
|
|
1276
|
-
|
|
1277
|
-
To analyse the cluster performance bottleneck based on timeline, define the time of a training
|
|
1278
|
-
step as "t_total", propose five metrics as follows:
|
|
1279
|
-
1) The time that "receive" operators not overlapped by others(t1)
|
|
1280
|
-
2) The time that is consumed inside the stage(t_total - t1)
|
|
1281
|
-
3) The time that "communication" operators not overlapped by others(t2)
|
|
1282
|
-
4) The time that consumed by computation(t_total - t2)
|
|
1283
|
-
5) The time that "collective communication" operators not overlapped by others(t3)
|
|
1284
|
-
In pipeline parallel mode, we can locate slow stage based on t_total - t1. Inside each stage,
|
|
1285
|
-
we can locate slow card based on t_total - t2. The value of t1 indicates the degree that
|
|
1286
|
-
communication time between stages slow down the training. The value of t3 indicates the degree
|
|
1287
|
-
that communication inside each stage slow down the training.
|
|
1288
|
-
"""
|
|
1289
|
-
step_num = len(step_info)
|
|
1290
|
-
is_pipeline_parallel = False
|
|
1291
|
-
comm_merged_timeline, _, comm_display_timeline = self._get_merged_time_list(
|
|
1292
|
-
comm_info,
|
|
1293
|
-
display_name="communication",
|
|
1294
|
-
factor=1e-3
|
|
1295
|
-
)
|
|
1296
|
-
compute_op_timeline = timeline + activity_info
|
|
1297
|
-
compute_op_timeline.sort(key=lambda x: float(x[self._start_time_idx]))
|
|
1298
|
-
compute_op_timeline_interval, _, compute_op_display_timeline = self._get_merged_time_list(
|
|
1299
|
-
compute_op_timeline,
|
|
1300
|
-
get_interval_time=True,
|
|
1301
|
-
factor=1e-3
|
|
1302
|
-
)
|
|
1303
|
-
# Consider if the overlap will be 0 or not.
|
|
1304
|
-
comm_not_overlapped_timeline = self._get_intersection_time(
|
|
1305
|
-
compute_op_timeline_interval,
|
|
1306
|
-
comm_merged_timeline
|
|
1307
|
-
)
|
|
1308
|
-
|
|
1309
|
-
# Process receive part.
|
|
1310
|
-
all_timeline = timeline + comm_info
|
|
1311
|
-
all_timeline.sort(key=lambda x: float(x[self._start_time_idx]))
|
|
1312
|
-
receive_op_timeline = self._produce_two_separated_timeline(
|
|
1313
|
-
all_timeline,
|
|
1314
|
-
"Receive-op"
|
|
1315
|
-
)[0]
|
|
1316
|
-
if receive_op_timeline:
|
|
1317
|
-
is_pipeline_parallel = True
|
|
1318
|
-
receive_op_merged_timeline = self._get_merged_time_list(receive_op_timeline,
|
|
1319
|
-
factor=1e-3)[0]
|
|
1320
|
-
|
|
1321
|
-
receive_op_not_overlapped_timeline = self._get_intersection_time(
|
|
1322
|
-
compute_op_timeline_interval,
|
|
1323
|
-
receive_op_merged_timeline,
|
|
1324
|
-
display_name="receive_op_not_overlapped"
|
|
1325
|
-
)
|
|
1326
|
-
|
|
1327
|
-
# Process collective communication part.
|
|
1328
|
-
collective_comm_timeline = self._produce_two_separated_timeline(
|
|
1329
|
-
comm_info,
|
|
1330
|
-
"Receive-op"
|
|
1331
|
-
)[-1]
|
|
1332
|
-
collective_comm_merged_timeline = self._get_merged_time_list(collective_comm_timeline,
|
|
1333
|
-
factor=1e-3)[0]
|
|
1334
|
-
collective_comm_not_overlapped_timeline = self._get_intersection_time(
|
|
1335
|
-
compute_op_timeline_interval,
|
|
1336
|
-
collective_comm_merged_timeline,
|
|
1337
|
-
display_name="exclude_receive_op"
|
|
1338
|
-
)
|
|
1339
|
-
|
|
1340
|
-
# Generate free time that exclude computation and communication time.
|
|
1341
|
-
all_timeline = compute_op_timeline + comm_info
|
|
1342
|
-
all_timeline.sort(key=lambda x: float(x[self._start_time_idx]))
|
|
1343
|
-
free_timeline = self._get_merged_time_list(
|
|
1344
|
-
all_timeline,
|
|
1345
|
-
get_interval_time=True,
|
|
1346
|
-
display_name="free_time",
|
|
1347
|
-
factor=1e-3
|
|
1348
|
-
)[1]
|
|
1349
|
-
|
|
1350
|
-
# Compute these five metrics mentioned above per step.
|
|
1351
|
-
recieve_alone_time = self._compute_time_inside_step(receive_op_not_overlapped_timeline, step_info)
|
|
1352
|
-
stage_time, computation_time = [], []
|
|
1353
|
-
comm_alone_time = self._compute_time_inside_step(comm_not_overlapped_timeline, step_info)
|
|
1354
|
-
collective_comm_alone_time = self._compute_time_inside_step(
|
|
1355
|
-
collective_comm_not_overlapped_timeline,
|
|
1356
|
-
step_info
|
|
1357
|
-
)
|
|
1358
|
-
for step in range(step_num):
|
|
1359
|
-
try:
|
|
1360
|
-
if is_pipeline_parallel:
|
|
1361
|
-
stage_time.append(step_info[step][self._duration_idx] - recieve_alone_time[step])
|
|
1362
|
-
computation_time.append(step_info[step][self._duration_idx] - comm_alone_time[step])
|
|
1363
|
-
except IndexError as e:
|
|
1364
|
-
logger.error(e)
|
|
1365
|
-
|
|
1366
|
-
metrices_per_step_list = [computation_time, comm_alone_time, stage_time,
|
|
1367
|
-
recieve_alone_time, collective_comm_alone_time]
|
|
1368
|
-
if step_num > 1:
|
|
1369
|
-
for metric in metrices_per_step_list:
|
|
1370
|
-
metric.append(sum(metric[1:]) / (step_num - 1))
|
|
1371
|
-
self._write_cluster_metrices(metrices_per_step_list, is_pipeline_parallel, "Gpu", self._device_id)
|
|
1372
|
-
|
|
1373
|
-
res_timeline = []
|
|
1374
|
-
res_timeline.extend(comm_not_overlapped_timeline)
|
|
1375
|
-
res_timeline.extend(compute_op_display_timeline)
|
|
1376
|
-
res_timeline.extend(comm_display_timeline)
|
|
1377
|
-
res_timeline.extend(free_timeline)
|
|
1378
|
-
return res_timeline
|
|
1379
|
-
|
|
1380
|
-
def _compute_time_inside_step(self, metric_timeline, step_time_list):
|
|
1381
|
-
"""Compute per step time of metric_timeline."""
|
|
1382
|
-
per_step_time_list = []
|
|
1383
|
-
step = 0
|
|
1384
|
-
cur_step_metric_time = 0
|
|
1385
|
-
factor_us_to_ns = 1e3
|
|
1386
|
-
step_end_time = step_time_list[step][self._start_time_idx] + \
|
|
1387
|
-
step_time_list[step][self._duration_idx] * factor_us_to_ns
|
|
1388
|
-
for time_item in metric_timeline:
|
|
1389
|
-
start_time = time_item[self._start_time_idx]
|
|
1390
|
-
if start_time > step_end_time:
|
|
1391
|
-
per_step_time_list.append(cur_step_metric_time)
|
|
1392
|
-
step += 1
|
|
1393
|
-
if step >= len(step_time_list):
|
|
1394
|
-
logger.warning("Compute profiler compute_time_inside_step time, "
|
|
1395
|
-
"find the data length is more than step count, "
|
|
1396
|
-
"maybe current graph has multi sub graph, skip the last data.")
|
|
1397
|
-
break
|
|
1398
|
-
step_end_time = step_time_list[step][self._start_time_idx] + \
|
|
1399
|
-
step_time_list[step][self._duration_idx] * factor_us_to_ns
|
|
1400
|
-
cur_step_metric_time = 0
|
|
1401
|
-
cur_step_metric_time += time_item[self._duration_idx]
|
|
1402
|
-
per_step_time_list.append(cur_step_metric_time)
|
|
1403
|
-
|
|
1404
|
-
return per_step_time_list
|
|
1405
|
-
|
|
1406
|
-
def _get_intersection_time(self, first_time_list, second_time_list,
|
|
1407
|
-
display_name="communication_not_overlapped"):
|
|
1408
|
-
"""Get intersection time of two time list."""
|
|
1409
|
-
first_list_idx, second_list_idx = 0, 0
|
|
1410
|
-
first_list_len = len(first_time_list)
|
|
1411
|
-
second_list_len = len(second_time_list)
|
|
1412
|
-
intersection_segment_display_list = []
|
|
1413
|
-
factor_ns_to_us = 1e-3
|
|
1414
|
-
while first_list_idx < first_list_len and second_list_idx < second_list_len:
|
|
1415
|
-
intersection_start = max(
|
|
1416
|
-
first_time_list[first_list_idx][self._start_time_idx],
|
|
1417
|
-
second_time_list[second_list_idx][self._start_time_idx]
|
|
1418
|
-
)
|
|
1419
|
-
intersection_end = min(
|
|
1420
|
-
first_time_list[first_list_idx][self._duration_idx],
|
|
1421
|
-
second_time_list[second_list_idx][self._duration_idx]
|
|
1422
|
-
)
|
|
1423
|
-
if intersection_start < intersection_end:
|
|
1424
|
-
intersection_segment_display_list.append(
|
|
1425
|
-
[display_name, self._tid_dict[display_name][0],
|
|
1426
|
-
intersection_start, (intersection_end - intersection_start) * factor_ns_to_us,
|
|
1427
|
-
self._tid_dict[display_name][1]]
|
|
1428
|
-
)
|
|
1429
|
-
if first_time_list[first_list_idx][self._duration_idx] >= \
|
|
1430
|
-
second_time_list[second_list_idx][self._duration_idx]:
|
|
1431
|
-
second_list_idx += 1
|
|
1432
|
-
else:
|
|
1433
|
-
first_list_idx += 1
|
|
1434
|
-
|
|
1435
|
-
return intersection_segment_display_list
|
|
1436
|
-
|
|
1437
|
-
def _produce_two_separated_timeline(self, timeline, op_name):
|
|
1438
|
-
"""Produce two separated timeline based on op_name."""
|
|
1439
|
-
timeline_include_op_name = []
|
|
1440
|
-
timeline_exclude_op_name = []
|
|
1441
|
-
for time_item in timeline:
|
|
1442
|
-
if op_name in time_item[self._op_name_idx]:
|
|
1443
|
-
timeline_include_op_name.append(time_item)
|
|
1444
|
-
else:
|
|
1445
|
-
timeline_exclude_op_name.append(time_item)
|
|
1446
|
-
return timeline_include_op_name, timeline_exclude_op_name
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
class AscendTimelineGenerator(BaseTimelineGenerator):
|
|
1450
|
-
"""Generate ascend Timeline data from file."""
|
|
1451
|
-
_display_filename = 'ascend_timeline_display_{}.json'
|
|
1452
|
-
_timeline_summary_filename = 'ascend_timeline_summary_{}.json'
|
|
1453
|
-
_cluster_analyse_filename = 'ascend_cluster_analyse_{}_{}_{}_{}.csv'
|
|
1454
|
-
|
|
1455
|
-
def __init__(self, profiling_dir, device_id, rank_id, rank_size, model):
|
|
1456
|
-
super().__init__(DeviceTarget.ASCEND.value, model)
|
|
1457
|
-
self._profiling_dir = profiling_dir
|
|
1458
|
-
self._device_id = device_id
|
|
1459
|
-
self._rank_id = rank_id
|
|
1460
|
-
self._rank_size = rank_size
|
|
1461
|
-
self._display_filename = self._display_filename.format(rank_id)
|
|
1462
|
-
self._timeline_summary_filename = self._timeline_summary_filename.format(rank_id)
|
|
1463
|
-
|
|
1464
|
-
@staticmethod
|
|
1465
|
-
def _get_all_reduce_names(communication_info):
|
|
1466
|
-
names = []
|
|
1467
|
-
for info in communication_info:
|
|
1468
|
-
# all_reduce_name format: stream_stream_id_stream_op_index_opname
|
|
1469
|
-
all_reduce_name = info[0][info[0].rindex('_') + 1:]
|
|
1470
|
-
if all_reduce_name not in names:
|
|
1471
|
-
names.append(all_reduce_name)
|
|
1472
|
-
return names
|
|
1473
|
-
|
|
1474
|
-
def init_timeline(self, communication_info, framework_info, aicpu_info, min_cycle_counter, source_path):
|
|
1475
|
-
"""
|
|
1476
|
-
Init timeline metadata, adding all collected info.
|
|
1477
|
-
|
|
1478
|
-
Args:
|
|
1479
|
-
communication_info (list[list]): The metadata of communication operator.
|
|
1480
|
-
framework_info (dict): The framework metadata.
|
|
1481
|
-
aicpu_info (dict): The metadata of AI CPU operator.
|
|
1482
|
-
min_cycle_counter (float): The minimum cycle counter of the timeline.
|
|
1483
|
-
source_path (str): The source of file.
|
|
1484
|
-
"""
|
|
1485
|
-
if min_cycle_counter == float('inf'):
|
|
1486
|
-
min_cycle_counter = 0
|
|
1487
|
-
|
|
1488
|
-
logger.info('Initiating timeline...')
|
|
1489
|
-
timeline_list = []
|
|
1490
|
-
op_timeline_list = self._get_op_timeline(communication_info, source_path)
|
|
1491
|
-
timeline_list.extend(op_timeline_list)
|
|
1492
|
-
|
|
1493
|
-
# Generate step time.
|
|
1494
|
-
self._set_step_start_and_end_op_name(timeline_list)
|
|
1495
|
-
step_time_list = self._get_step_time_list(timeline_list)
|
|
1496
|
-
|
|
1497
|
-
# Add Scope Name.
|
|
1498
|
-
default_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Default")
|
|
1499
|
-
gradient_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Gradients")
|
|
1500
|
-
recompute_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "recompute_Default")
|
|
1501
|
-
|
|
1502
|
-
# Add AI CPU data into timeline temp list and sort by start time.
|
|
1503
|
-
aicpu_data = aicpu_info.get('info')
|
|
1504
|
-
if aicpu_data:
|
|
1505
|
-
timeline_list.extend(aicpu_data)
|
|
1506
|
-
self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0)
|
|
1507
|
-
self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0)
|
|
1508
|
-
self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0)
|
|
1509
|
-
self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0)
|
|
1510
|
-
|
|
1511
|
-
timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
|
|
1512
|
-
|
|
1513
|
-
# Add AllReduce info to timeline temp list and sort by start time.
|
|
1514
|
-
if communication_info:
|
|
1515
|
-
logger.debug('AllReduce info found. Start adding info into timeline...')
|
|
1516
|
-
cluster_related_timeline = self._get_cluster_timeline(
|
|
1517
|
-
timeline_list, communication_info, step_time_list)
|
|
1518
|
-
timeline_list.extend(cluster_related_timeline)
|
|
1519
|
-
timeline_list.extend(communication_info)
|
|
1520
|
-
timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
|
|
1521
|
-
|
|
1522
|
-
# Add step time and scope name info.
|
|
1523
|
-
timeline_list.extend(step_time_list)
|
|
1524
|
-
timeline_list.extend(default_scope_name_time_list)
|
|
1525
|
-
timeline_list.extend(recompute_scope_name_time_list)
|
|
1526
|
-
timeline_list.extend(gradient_scope_name_time_list)
|
|
1527
|
-
timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
|
|
1528
|
-
|
|
1529
|
-
# Init a dict for counting the num of streams.
|
|
1530
|
-
stream_count_dict = {}
|
|
1531
|
-
for timeline in timeline_list:
|
|
1532
|
-
self._parse_timeline_data(timeline, min_cycle_counter)
|
|
1533
|
-
# Updating the collection of streams.
|
|
1534
|
-
if len(timeline) == 4:
|
|
1535
|
-
self._update_num_of_streams(timeline, stream_count_dict)
|
|
1536
|
-
|
|
1537
|
-
# Add format thread meta data.
|
|
1538
|
-
self._format_meta_data_list.extend(self._timeline_meta)
|
|
1539
|
-
self._timeline_meta = self._format_meta_data_list
|
|
1540
|
-
# Get framework metadata.
|
|
1541
|
-
framework_obj_list = framework_info.get('object')
|
|
1542
|
-
# The length of list is the number of operators.
|
|
1543
|
-
self._timeline_summary['num_of_ops'] += len(framework_obj_list)
|
|
1544
|
-
self._add_framework_info(framework_obj_list)
|
|
1545
|
-
logger.info('Finished adding info into timeline...')
|
|
1546
|
-
|
|
1547
|
-
# Update timeline summary info
|
|
1548
|
-
self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
|
|
1549
|
-
|
|
1550
|
-
def init_pynative_timeline(self):
|
|
1551
|
-
"""Init timeline for pynative model."""
|
|
1552
|
-
timeline_list = OPIntermediateParser(self._profiling_dir, self._rank_id).get_timeline_data()
|
|
1553
|
-
cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._model)
|
|
1554
|
-
cpu_timeline_list = cpu_timeline_generator.load_cpu_op_data()
|
|
1555
|
-
if cpu_timeline_list:
|
|
1556
|
-
self._pynative_clock_synchronize(cpu_timeline_list)
|
|
1557
|
-
timeline_list.extend(cpu_timeline_list)
|
|
1558
|
-
|
|
1559
|
-
self._register_op_name(timeline_list)
|
|
1560
|
-
self._timeline_summary['op_exe_times'] = len(timeline_list)
|
|
1561
|
-
self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
|
|
1562
|
-
self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
|
|
1563
|
-
self._timeline_summary['num_of_ops'] = len(self._op_name_list)
|
|
1564
|
-
|
|
1565
|
-
timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
|
|
1566
|
-
min_cycle_counter = float(timeline_list[0][self._start_time_idx])
|
|
1567
|
-
|
|
1568
|
-
step_timeline = self._pynative_get_step_timeline_list(timeline_list)
|
|
1569
|
-
timeline_list.extend(step_timeline)
|
|
1570
|
-
|
|
1571
|
-
stream_count_dict = {}
|
|
1572
|
-
max_scope_name_num = 0
|
|
1573
|
-
for timeline in timeline_list:
|
|
1574
|
-
self._parse_timeline_data(timeline, min_cycle_counter)
|
|
1575
|
-
self._update_num_of_streams(timeline, stream_count_dict)
|
|
1576
|
-
cur_scope_name_num = len(timeline[self._op_name_idx].split('/')) - 1
|
|
1577
|
-
max_scope_name_num = max(cur_scope_name_num, max_scope_name_num)
|
|
1578
|
-
|
|
1579
|
-
self._timeline_summary['max_scope_name_num'] = max_scope_name_num
|
|
1580
|
-
self._timeline_summary['num_of_streams'] = len(stream_count_dict)
|
|
1581
|
-
|
|
1582
|
-
def _parse_timeline_data(self, timeline, min_cycle_counter):
|
|
1583
|
-
"""Parse timeline data."""
|
|
1584
|
-
# factor to convert the time unit from 1ms to 1us for timeline display
|
|
1585
|
-
factor = 1000
|
|
1586
|
-
op_meta = TimelineContainer(timeline)
|
|
1587
|
-
timeline_dict = {}
|
|
1588
|
-
timeline_dict['name'] = op_meta.op_name.split('/')[-1]
|
|
1589
|
-
timeline_dict['ph'] = 'X'
|
|
1590
|
-
timeline_dict['tid'] = op_meta.stream_id
|
|
1591
|
-
timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) * factor
|
|
1592
|
-
dur = op_meta.duration * factor
|
|
1593
|
-
timeline_dict['dur'] = dur
|
|
1594
|
-
if op_meta.pid is None:
|
|
1595
|
-
timeline_dict['pid'] = int(self._device_id)
|
|
1596
|
-
# Update total time of operator execution.
|
|
1597
|
-
if op_meta.stream_id not in ["Steps", "Scope Name"]:
|
|
1598
|
-
self._timeline_summary['total_time'] += op_meta.duration
|
|
1599
|
-
else: # AllReduce and AI CPU pid
|
|
1600
|
-
timeline_dict['pid'] = op_meta.pid
|
|
1601
|
-
|
|
1602
|
-
if op_meta.stream_id == "Scope Name":
|
|
1603
|
-
# remove the level of scope name which has a format like "0-conv2-Conv2d".
|
|
1604
|
-
timeline_dict['name'] = "-".join(op_meta.op_name.split('-')[1:])
|
|
1605
|
-
timeline_dict['scope_level'] = int(op_meta.op_name.split('-')[0])
|
|
1606
|
-
elif op_meta.stream_id[:len(self._host_cpu_op_label)] == self._host_cpu_op_label:
|
|
1607
|
-
timeline_dict['pid'] = self._HOST_CPU_PID
|
|
1608
|
-
|
|
1609
|
-
self._update_format_meta_data(timeline_dict)
|
|
1610
|
-
self._timeline_meta.append(timeline_dict)
|
|
1611
|
-
|
|
-    def _get_op_timeline(self, communication_info, source_path):
-        """get ai_core and cpu timeline."""
-        all_reduce_names = AscendTimelineGenerator._get_all_reduce_names(communication_info)
-        timeline_list = OPIntermediateParser(self._profiling_dir, self._rank_id).get_timeline_data(all_reduce_names)
-        for timeline in timeline_list:
-            timeline[self._tid_idx] = f"Stream #{timeline[self._tid_idx]}"
-
-        cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._model)
-        cpu_timeline_list = cpu_timeline_generator.get_timeline_data()
-        if cpu_timeline_list:
-            self._clock_synchronize_to_device(cpu_timeline_list, source_path)
-            timeline_list.extend(cpu_timeline_list)
-        timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
-        self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
-        self._timeline_summary['op_exe_times'] = len(timeline_list)
-        self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
-        return timeline_list
-
-    def _clock_synchronize_to_device(self, timeline_list, source_path):
-        """Synchronize the timestamp from host to device."""
-        host_start_file_path = os.path.join(source_path, f"host_start.log.{self._device_id}")
-        dev_start_file_path = os.path.join(source_path, f"dev_start.log.{self._device_id}")
-
-        try:
-            with open(host_start_file_path) as f_obj:
-                lines = f_obj.readlines()
-                # lines[2] stores host monotonic_raw time of start training.
-                host_monotonic = int(lines[2].strip().split(':')[1])
-        except (IOError, OSError) as err:
-            logger.critical('Error occurred when read host_start.log: %s', err)
-            raise ProfilerIOException() from err
-        try:
-            with open(dev_start_file_path) as f_obj:
-                lines = f_obj.readlines()
-                # lines[2] stores device cycle counter of start training.
-                dev_cntvct = int(lines[2].strip().split(':')[1])
-        except (IOError, OSError) as err:
-            logger.critical('Error occurred when read dev_start.log: %s', err)
-            raise ProfilerIOException() from err
-
-        factor_ns_to_ms = 1e-6
-        factor_ten_ns_to_ns = 10
-        factor_ms_to_ns = 1e6
-        for idx, time_item in enumerate(timeline_list):
-            host_time = int(float(time_item[self._start_time_idx]) * factor_ms_to_ns)
-            device_time = dev_cntvct * factor_ten_ns_to_ns + (host_time - host_monotonic)
-            timeline_list[idx][self._start_time_idx] = device_time * factor_ns_to_ms
-
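The loop above shifts every host timestamp onto the device time base by the offset between the host monotonic start time and the device counter at training start (the counter ticks in 10 ns units, hence `factor_ten_ns_to_ns`). A standalone sketch of the same arithmetic, using made-up calibration values:

```python
# Made-up calibration values: host start in ns (monotonic_raw), device start counter (10 ns ticks).
host_monotonic = 1_000_000_000   # ns at training start, as read from host_start.log
dev_cntvct = 50_000_000          # device cycle counter at training start

def host_ms_to_device_ms(host_ms):
    """Map a host timestamp in ms onto the device time base, returning ms."""
    host_ns = int(host_ms * 1e6)                               # ms -> ns
    device_ns = dev_cntvct * 10 + (host_ns - host_monotonic)   # shift by host/device start offset
    return device_ns * 1e-6                                    # ns -> ms

print(host_ms_to_device_ms(1000.5))  # a host event 0.5 ms after training start -> 500.5
```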
-    def _add_framework_info(self, framework_obj_list):
-        """
-        Add framework info into timeline metadata.
-
-        Args:
-            framework_obj_list (list): The framework metadata.
-        """
-        logger.debug('Start adding framework info into timeline...')
-        # Get the framework info that will be written into timeline.
-        framework_info_dict = {}
-        for framework_obj in framework_obj_list:
-            op_name = framework_obj[0]
-            op_type = framework_obj[1]
-            op_full_name = framework_obj[4]
-            op_info = framework_obj[5]
-            framework_info = {
-                'name': op_name,
-                'args': {
-                    'type': op_type,
-                    'fullname': op_full_name
-                }
-            }
-            framework_info.get('args').update(op_info)
-            framework_info_dict[op_full_name] = framework_info
-
-        # Insert framework info into timeline.
-        for timeline_item in self._timeline_meta:
-            op_full_name = timeline_item.get('name')
-            framework_item = framework_info_dict.get(op_full_name)
-            if framework_item:
-                timeline_item['name'] = framework_item.get('name')
-                timeline_item['args'] = framework_item.get('args')
-        logger.debug('Finished adding framework info into timeline...')
-
-    def _produce_two_separated_timeline(self, timeline, op_name):
-        """Produce two separated timeline based on op_name."""
-        timeline_include_op_name = []
-        timeline_exclude_op_name = []
-        for time_item in timeline:
-            if op_name in time_item[self._op_name_idx]:
-                timeline_include_op_name.append(time_item)
-            else:
-                timeline_exclude_op_name.append(time_item)
-        return timeline_include_op_name, timeline_exclude_op_name
-
-    def _get_cluster_timeline(self, aicore_info, comm_info, step_info):
-        """
-        Analyse the cluster communication and computation data, and write result to file.
-
-        To analyse the cluster performance bottleneck based on timeline, define the time of a training
-        step as "t_total", propose five metrics as follows:
-        1) The time that "receive" operators not overlapped by others(t1)
-        2) The time that is consumed inside the stage(t_total - t1)
-        3) The time that "communication" operators not overlapped by others(t2)
-        4) The time that consumed by computation(t_total - t2)
-        5) The time that "collective communication" operators not overlapped by others(t3)
-        In pipeline parallel mode, we can locate slow stage based on t_total - t1. Inside each stage,
-        we can locate slow card based on t_total - t2. The value of t1 indicates the degree that
-        communication time between stages slow down the training. The value of t3 indicates the degree
-        that communication inside each stage slow down the training.
-        """
-        is_pipeline_parallel = False
-        comm_merged_timeline, _, comm_display_timeline = self._get_merged_time_list(
-            comm_info, display_name="communication"
-        )
-        aicore_timeline_interval, _, aicore_display_timeline = self._get_merged_time_list(
-            aicore_info, get_interval_time=True
-        )
-        # Consider if the overlap will be 0 or not.
-        comm_not_overlapped_timeline = self._get_intersection_time(
-            aicore_timeline_interval, comm_merged_timeline
-        )
-
-        # Process receive part.
-        all_timeline = aicore_info + comm_info
-        all_timeline.sort(key=lambda x: float(x[self._start_time_idx]))
-        receive_op_timeline, timeline_exclude_receive_op = self._produce_two_separated_timeline(
-            all_timeline, "Receive-op"
-        )
-        if receive_op_timeline:
-            is_pipeline_parallel = True
-        receive_op_merged_timeline = self._get_merged_time_list(receive_op_timeline)[0]
-        timeline_exclude_receive_op_interval = self._get_merged_time_list(
-            timeline_exclude_receive_op, get_interval_time=True
-        )[0]
-        receive_op_not_overlapped_timeline = self._get_intersection_time(
-            timeline_exclude_receive_op_interval, receive_op_merged_timeline
-        )
-
-        # Process collective communication part.
-        collective_comm_timeline = self._produce_two_separated_timeline(
-            comm_info, "Receive-op"
-        )[-1]
-        collective_comm_merged_timeline = self._get_merged_time_list(collective_comm_timeline)[0]
-        collective_comm_not_overlapped_timeline = self._get_intersection_time(
-            aicore_timeline_interval, collective_comm_merged_timeline
-        )
-
-        # Generate free time that exclude computation and communication time.
-        free_timeline = self._get_merged_time_list(
-            all_timeline, get_interval_time=True, display_name="free_time"
-        )[1]
-
-        self._parse_cluster_metrices(step_info, receive_op_not_overlapped_timeline, comm_not_overlapped_timeline
-                                     , collective_comm_not_overlapped_timeline, is_pipeline_parallel)
-
-        res_timeline = []
-        res_timeline.extend(comm_not_overlapped_timeline)
-        res_timeline.extend(aicore_display_timeline)
-        res_timeline.extend(comm_display_timeline)
-        res_timeline.extend(free_timeline)
-
-        return res_timeline
-
-    def _parse_cluster_metrices(self, step_info, receive_op_not_overlapped_timeline, comm_not_overlapped_timeline
-                                , collective_comm_not_overlapped_timeline, is_pipeline_parallel):
-        """Write the cluster metrices"""
-        step_num = len(step_info)
-        # Compute these five metrics mentioned above per step.
-        recieve_alone_time = self._compute_time_inside_step(receive_op_not_overlapped_timeline, step_info)
-        stage_time, computation_time = [], []
-        comm_alone_time = self._compute_time_inside_step(comm_not_overlapped_timeline, step_info)
-        collective_comm_alone_time = self._compute_time_inside_step(
-            collective_comm_not_overlapped_timeline, step_info
-        )
-        for step in range(step_num):
-            try:
-                if is_pipeline_parallel:
-                    stage_time.append(step_info[step][self._duration_idx] - recieve_alone_time[step])
-                computation_time.append(step_info[step][self._duration_idx] - comm_alone_time[step])
-            except IndexError as err:
-                logger.error(err)
-        metrices_per_step_list = [computation_time, comm_alone_time, stage_time,
-                                  recieve_alone_time, collective_comm_alone_time]
-        if step_num > 1:
-            for metric in metrices_per_step_list:
-                metric.append(sum(metric[1:]) / (step_num - 1))
-        self._write_cluster_metrices(metrices_per_step_list, is_pipeline_parallel, "Ascend", self._rank_id)
-
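For a single step, the metrics described in the `_get_cluster_timeline` docstring reduce to simple subtractions from the step duration. A worked example with made-up per-step numbers (all in ms), mirroring the loop above:

```python
# Made-up per-step numbers (ms) for one rank.
t_total = 120.0   # step duration
t1 = 15.0         # Receive-op time not overlapped by other operators
t2 = 30.0         # communication time not overlapped by computation
t3 = 12.0         # collective communication time not overlapped by computation

stage_time = t_total - t1        # 105.0 ms spent inside the pipeline stage
computation_time = t_total - t2  # 90.0 ms attributable to computation
print(stage_time, computation_time, t3)
```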
-    def _compute_time_inside_step(self, metric_timeline, step_time_list):
-        """Compute per step time of metric_timeline."""
-        per_step_time_list = [0 for i in range(len(step_time_list))]
-        step = 0
-        step_end_time = step_time_list[step][self._start_time_idx] + \
-                        step_time_list[step][self._duration_idx]
-        for time_item in metric_timeline:
-            start_time = time_item[self._start_time_idx]
-            if start_time > step_end_time:
-                step += 1
-                if step >= len(step_time_list):
-                    logger.warning("Compute profiler compute_time_inside_step time, "
-                                   "find the data length is more than step count, "
-                                   "maybe current graph has multi sub graph, skip the last data.")
-                    break
-                step_end_time = step_time_list[step][self._start_time_idx] + \
-                                step_time_list[step][self._duration_idx]
-            per_step_time_list[step] += time_item[self._duration_idx]
-
-        return per_step_time_list
-
-    def _get_intersection_time(self, first_time_list, second_time_list,
-                               display_name="communication_not_overlapped"):
-        """Get intersection time of two time list."""
-        first_list_idx, second_list_idx = 0, 0
-        first_list_len = len(first_time_list)
-        second_list_len = len(second_time_list)
-        intersection_segment_display_list = []
-
-        while first_list_idx < first_list_len and \
-                second_list_idx < second_list_len:
-            intersection_start = max(
-                first_time_list[first_list_idx][self._start_time_idx],
-                second_time_list[second_list_idx][self._start_time_idx]
-            )
-            intersection_end = min(
-                first_time_list[first_list_idx][self._duration_idx],
-                second_time_list[second_list_idx][self._duration_idx]
-            )
-            if intersection_start < intersection_end:
-                tid = self._tid_dict.get(display_name, [0, 0])
-                intersection_segment_display_list.append(
-                    [display_name, tid[0],
-                     intersection_start, intersection_end - intersection_start, tid[1]]
-                )
-            if first_time_list[first_list_idx][self._duration_idx] >= \
-                    second_time_list[second_list_idx][self._duration_idx]:
-                second_list_idx += 1
-            else:
-                first_list_idx += 1
-
-        return intersection_segment_display_list
-
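`_get_intersection_time` is a standard two-pointer sweep over two sorted lists of merged intervals (the "duration" slot of these merged lists appears to hold an interval end time here, since it is compared and subtracted as a timestamp). A minimal standalone sketch of the same idea on plain (start, end) tuples:

```python
def intersect_intervals(first, second):
    """Return overlaps between two sorted, non-overlapping lists of (start, end) intervals."""
    i, j, result = 0, 0, []
    while i < len(first) and j < len(second):
        start = max(first[i][0], second[j][0])
        end = min(first[i][1], second[j][1])
        if start < end:
            result.append((start, end))
        # Advance the list whose current interval finishes first.
        if first[i][1] >= second[j][1]:
            j += 1
        else:
            i += 1
    return result

print(intersect_intervals([(0, 10), (20, 30)], [(5, 25)]))  # [(5, 10), (20, 25)]
```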
-    def _pynative_get_step_timeline_list(self, timeline_list):
-        """Get step timeline list for pynative model."""
-        step_list = []
-        # The timeline starts with the GetNext op
-        if len(timeline_list) < 2 or 'GetNext' not in timeline_list[0][self._op_name_idx] and \
-                'GetNext' not in timeline_list[1][self._op_name_idx]:
-            return step_list
-        step = [-1, -1]
-        step_num = 0
-        tid = "Steps"
-        for timeline in timeline_list:
-            if 'GetNext' not in timeline[self._op_name_idx]:
-                continue
-            start_time = float(timeline[self._start_time_idx])
-            if step[0] == -1:
-                step[0] = start_time
-            else:
-                step[1] = start_time - step[0]
-                step_num = step_num + 1
-                step_list.append([str(step_num), tid, float(step[0]), step[1]])
-                step = [start_time, -1]
-        if step[0] != -1 and step[1] == -1:
-            step_num = step_num + 1
-            step_list.append([str(step_num), tid, float(step[0]),
-                              float(timeline_list[-1][self._start_time_idx]) - step[0]])
-        return step_list
-
-    def _pynative_clock_synchronize(self, timeline_list):
-        """Synchronize the timestamp from device to host."""
-        start_time_file_path = os.path.join(self._profiling_dir, f"start_time_{self._rank_id}.txt")
-        try:
-            with open(start_time_file_path) as f_obj:
-                lines = f_obj.readlines()
-                # lines[0] stores the host monotonic time of start training.
-                host_monotonic_start_time = int(lines[0].strip().split(':')[-1])
-                # lines[1] stores the gpu time of start training.
-                gpu_start_time = int(lines[1].strip().split(':')[-1])
-        except (IOError, OSError) as err:
-            logger.critical(f'Error occurred when read {start_time_file_path}: {err}')
-            raise ProfilerIOException() from err
-        time_diff = gpu_start_time * 1000 - host_monotonic_start_time
-        for idx, time_item in enumerate(timeline_list):
-            timeline_list[idx][self._start_time_idx] = int(time_item[self._start_time_idx]) + time_diff
-            timeline_list[idx][self._start_time_idx] = timeline_list[idx][self._start_time_idx] / 1000000
-            timeline_list[idx][self._duration_idx] = timeline_list[idx][self._duration_idx] / 1000
-
-    def _set_step_start_and_end_op_name(self, timeline_list):
-        """Set the start and end operator full name of each step."""
-        if not timeline_list or len(timeline_list) < 2:
-            return
-
-        start_op_idx = 0
-        self._step_end_op_name = timeline_list[-1][self._op_name_idx]
-        for i, timeline in enumerate(timeline_list):
-            if timeline[self._op_name_idx] == self._step_end_op_name:
-                start_op_idx = i + 1
-                break
-
-        if start_op_idx >= len(timeline_list):
-            start_op_idx = 0
-        self._step_start_op_name = timeline_list[start_op_idx][self._op_name_idx]
-
-
-class CpuTimelineGenerator(GpuTimelineGenerator):
-    """Generate cpu Timeline data from file."""
-    _output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt"
-    _display_filename = 'cpu_timeline_display_{}.json'
-    _timeline_summary_filename = 'cpu_timeline_summary_{}.json'
-
-    def __init__(self, profiling_dir, model):
-        super().__init__(profiling_dir, 0, 0, model)
-        self._device_target = DeviceTarget.CPU.value
-
-    def get_timeline_data(self):
-        """Get timeline data from file."""
-        timeline_list = self.load_cpu_op_data()
-        factor_ns_to_ms = 1e6
-        factor_us_to_ms = 1e3
-        for time_item in timeline_list:
-            time_item[self._start_time_idx] = float(time_item[self._start_time_idx]) / factor_ns_to_ms
-            time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms
-
-        return timeline_list
-
-    def init_timeline(self):
-        """Init timeline metadata, adding all collected info."""
-        timeline_list = self._load_timeline_data()
-
-        # Init a dict for counting the num of streams.
-        stream_count_dict = {}
-        for timeline in timeline_list:
-            self._parse_timeline_data(timeline, 0)
-            # Updating the collection of streams.
-            if len(timeline) == 4:
-                self._update_num_of_streams(timeline, stream_count_dict)
-
-        # Add format thread meta data.
-        self._format_meta_data_list.extend(self._timeline_meta)
-        self._timeline_meta = self._format_meta_data_list
-
-        # Update timeline summary info
-        self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
-
-    def load_cpu_op_data(self):
-        """Load cpu operator data from file"""
-        op_file_path = self._get_and_validate_path(
-            self._output_op_execute_time_file_path)
-        timeline_list = []
-        if not os.path.exists(op_file_path):
-            logger.info("No cpu operator info.")
-            return timeline_list
-        timeline_list = self._load_op_data(op_file_path)
-        factor_ms_to_us = 1e-3
-        for time_item in timeline_list:
-            time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_ms_to_us
-
-        return timeline_list
-
-    def _get_and_validate_path(self, file_name):
-        """Generate op or activity file path from file name, and validate this path."""
-        file_path = os.path.join(
-            self._profiling_dir,
-            file_name.format(self._device_id)
-        )
-        file_path = validate_and_normalize_path(file_path)
-
-        return file_path
-
-    def _load_op_data(self, op_file_path):
-        """Load operator data from file"""
-        op_timeline_list = []
-        try:
-            with open(op_file_path, 'r') as f_obj:
-                for line in f_obj:
-                    self._timeline_summary['num_of_ops'] += 1
-                    op_list = line.strip('\n').strip().split(';')
-                    time_arr = op_list[-1]
-                    time_arr = time_arr.split(" ")
-                    for time in time_arr:
-                        time = time.split(",")
-                        if len(time) == 3:
-                            # for time value is [start_timestamp, duration, tid]
-                            # line_list[1] would be like "HostCpuOps" + str(tid)
-                            line_list = op_list[:1] + [op_list[1] + str(time[-1])] + time[:-1]
-                        else:
-                            # for time value is [start_timestamp, duration]
-                            line_list = op_list[:2] + time
-                        op_timeline_list.append(line_list)
-        except (IOError, OSError) as err:
-            logger.critical('Error occurred when load operator timeline data intermediate file: %s', err)
-            raise ProfilerIOException() from err
-
-        return op_timeline_list
-
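`_load_op_data` assumes each line of the intermediate file is semicolon-separated metadata followed by space-separated time tuples, each tuple holding comma-separated start, duration and optional tid values. A standalone sketch of that per-line transformation, using a fabricated record (the operator name and numbers are made up):

```python
# Fabricated record in the layout the loop above expects:
# "<op full name>;<label>;<start,duration,tid> <start,duration,tid> ..."
line = "Default/network/Conv2D-op1;HostCpuOps;100,5,3 250,4,3"

op_list = line.strip().split(';')
rows = []
for item in op_list[-1].split(" "):
    fields = item.split(",")
    if len(fields) == 3:
        # [op_name, "HostCpuOps" + tid, start_timestamp, duration]
        rows.append(op_list[:1] + [op_list[1] + fields[-1]] + fields[:-1])
    else:
        # [op_name, label, start_timestamp, duration]
        rows.append(op_list[:2] + fields)

print(rows)
# [['Default/network/Conv2D-op1', 'HostCpuOps3', '100', '5'],
#  ['Default/network/Conv2D-op1', 'HostCpuOps3', '250', '4']]
```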
-    def _load_timeline_data(self):
-        """Load timeline data from file."""
-        timeline_list = self.load_cpu_op_data()
-
-        timeline_list.sort(key=lambda x: float(x[self._start_time_idx]))
-        self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
-        self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
-
-        # Generate step time.
-        factor_start_time_uint_to_duration = 1e-3
-        self._set_step_start_and_end_op_name(timeline_list)
-
-        step_time_list = self._get_step_time_list(timeline_list, factor_start_time_uint_to_duration)
-
-        # Add merge compute time and free time
-        merge_compute_timeline = self._get_merged_time_list(
-            timeline_list, False, "computation_op", factor_start_time_uint_to_duration)[2]
-        free_time_timeline = self._get_merged_time_list(
-            timeline_list, True, "free_time", factor_start_time_uint_to_duration)[1]
-
-        # Add Scope Name.
-        default_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Default",
-                                                                      factor_start_time_uint_to_duration)
-        gradient_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Gradients",
-                                                                       factor_start_time_uint_to_duration)
-        recompute_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "recompute_Default",
-                                                                        factor_start_time_uint_to_duration)
-        timeline_list.extend(default_scope_name_time_list)
-        timeline_list.extend(gradient_scope_name_time_list)
-        timeline_list.extend(recompute_scope_name_time_list)
-        timeline_list.extend(step_time_list)
-
-        timeline_list.sort(key=lambda x: (float(x[self._start_time_idx]), x[self._tid_idx]))
-        timeline_list.sort(key=lambda x: float(x[2]))
-        timeline_list.extend(merge_compute_timeline)
-        timeline_list.extend(free_time_timeline)
-
-        return timeline_list
-
-    def _parse_timeline_data(self, timeline, min_cycle_counter):
-        """Parse timeline data."""
-        # factor to convert the time unit of start_time(ts) from 1ns to 1us for timeline display
-        factor = 1000
-        op_meta = TimelineContainer(timeline)
-        timeline_dict = {}
-        timeline_dict['name'] = op_meta.op_name.split('/')[-1]
-        timeline_dict['ph'] = 'X'
-        timeline_dict['tid'] = op_meta.stream_id
-        timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) / factor
-        dur = op_meta.duration
-        timeline_dict['dur'] = dur
-        timeline_dict['pid'] = int(self._device_id)
-        if op_meta.stream_id == "Scope Name":
-            # remove the level of scope name which has a format like "0-conv2-Conv2d".
-            timeline_dict['name'] = "-".join(op_meta.op_name.split('-')[1:])
-            timeline_dict['scope_level'] = int(op_meta.op_name.split('-')[0])
-        elif self._host_cpu_op_label == op_meta.stream_id[:len(self._host_cpu_op_label)]:
-            timeline_dict['pid'] = self._HOST_CPU_PID
-
-        if len(timeline) == 5:
-            # len(timeline) == 5 refers to analyse data.
-            timeline_dict["pid"] = op_meta.pid
-        elif op_meta.stream_id not in ["Scope Name", "Steps"]:
-            # Update total time of operator execution.
-            self._timeline_summary['total_time'] += dur / factor
-            self._timeline_summary['op_exe_times'] += 1
-
-        self._update_format_meta_data(timeline_dict)
-        self._timeline_meta.append(timeline_dict)