mindspore-2.4.0-cp311-cp311-manylinux1_x86_64.whl → mindspore-2.4.10-cp311-cp311-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/_c_dataengine.cpython-311-x86_64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-311-x86_64-linux-gnu.so +0 -0
- mindspore/_c_mindrecord.cpython-311-x86_64-linux-gnu.so +0 -0
- mindspore/bin/cache_admin +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/common/api.py +1 -4
- mindspore/common/file_system.py +2 -0
- mindspore/common/initializer.py +51 -15
- mindspore/common/parameter.py +6 -5
- mindspore/common/tensor.py +15 -49
- mindspore/communication/_comm_helper.py +5 -0
- mindspore/communication/comm_func.py +7 -7
- mindspore/context.py +16 -2
- mindspore/dataset/engine/datasets_standard_format.py +17 -0
- mindspore/dataset/engine/datasets_user_defined.py +27 -1
- mindspore/experimental/llm_boost/__init__.py +2 -2
- mindspore/experimental/llm_boost/atb/boost_base.py +240 -64
- mindspore/experimental/llm_boost/atb/llama_boost.py +46 -29
- mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
- mindspore/include/api/context.h +1 -1
- mindspore/include/dataset/constants.h +2 -2
- mindspore/include/mindapi/base/format.h +13 -0
- mindspore/lib/libavcodec.so.59 +0 -0
- mindspore/lib/libavdevice.so.59 +0 -0
- mindspore/lib/libavfilter.so.8 +0 -0
- mindspore/lib/libavformat.so.59 +0 -0
- mindspore/lib/libavutil.so.57 +0 -0
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libicuuc.so.69 +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_glog.so.0 +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_ops.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
- mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
- mindspore/lib/libswresample.so.4 +0 -0
- mindspore/lib/libswscale.so.6 +0 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_proto/libop_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/framework/npu_supported_ops.json +10 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910}/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json +0 -42
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/decoder_kv_cache.py +51 -16
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/prompt_kv_cache.py +51 -16
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/binary_info_config.json +302 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_proto/inc/op_proto.h +33 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_proto/lib/linux/x86_64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/version.info +1 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/framework/npu_supported_ops.json +14 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/include/aclnn_decoder_kv_cache.h +59 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/include/aclnn_prompt_kv_cache.h +59 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl}/dynamic/all_finite.py +51 -16
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/decoder_kv_cache.cpp +192 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/decoder_kv_cache.py +215 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/prompt_kv_cache.cpp +274 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/prompt_kv_cache.py +215 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_f55e0ebaad1f2f572e43677336992fa0.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_576ceaeef5870c451cab59af55ea46ad.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_86a73ff6e28d734c96bb8d3054f7dd18.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/all_finite.json +139 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/binary_info_config.json +361 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/all_finite.json +139 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/binary_info_config.json +361 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_proto/lib/linux/x86_64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/version.info +1 -0
- mindspore/lib/plugin/ascend/custom_compiler/setup.py +1 -1
- mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
- mindspore/lib/plugin/ascend/libhccl_plugin.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_internal_kernels.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/PkgInspect +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/op_man +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_cann_host.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_host.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/utils/rt/base/types.h +5 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops_static.a +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/liblcal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/liblcal_static.a +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme_op.h +1 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/flash_attention_score_op.h +6 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/paged_attention_op.h +6 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/rms_norm_op.h +4 -3
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_quant_acme_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_310p_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bsh_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bsh_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblcal.so +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
- mindspore/lib/plugin/gpu10.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
- mindspore/mint/__init__.py +490 -2
- mindspore/mint/nn/__init__.py +2 -2
- mindspore/mint/optim/adamw.py +6 -14
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +16 -4
- mindspore/nn/layer/basic.py +24 -7
- mindspore/nn/layer/conv.py +3 -0
- mindspore/nn/layer/embedding.py +31 -14
- mindspore/nn/layer/pooling.py +8 -10
- mindspore/nn/optim/tft_wrapper.py +12 -15
- mindspore/nn/utils/__init__.py +22 -0
- mindspore/nn/utils/init.py +71 -0
- mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
- mindspore/ops/_grad_experimental/grad_comm_ops.py +45 -8
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +6 -0
- mindspore/ops/auto_generate/gen_extend_func.py +33 -0
- mindspore/ops/auto_generate/gen_ops_def.py +52 -3
- mindspore/ops/auto_generate/gen_ops_prim.py +158 -8
- mindspore/ops/function/array_func.py +2 -0
- mindspore/ops/function/math_func.py +12 -5
- mindspore/ops/function/random_func.py +221 -7
- mindspore/ops/operations/__init__.py +1 -1
- mindspore/ops/operations/array_ops.py +3 -1
- mindspore/ops/operations/comm_ops.py +25 -1
- mindspore/ops/operations/custom_ops.py +6 -4
- mindspore/ops/operations/manually_defined/ops_def.py +8 -10
- mindspore/ops/operations/nn_ops.py +7 -2
- mindspore/parallel/_auto_parallel_context.py +26 -5
- mindspore/parallel/_cell_wrapper.py +24 -3
- mindspore/parallel/_tensor.py +46 -2
- mindspore/parallel/_utils.py +39 -21
- mindspore/parallel/transform_safetensors.py +196 -43
- mindspore/profiler/profiling.py +5 -1
- mindspore/run_check/_check_version.py +20 -9
- mindspore/train/_utils.py +92 -32
- mindspore/train/callback/_checkpoint.py +12 -9
- mindspore/train/callback/_on_request_exit.py +12 -1
- mindspore/train/callback/_tft_register.py +33 -9
- mindspore/train/dataset_helper.py +10 -2
- mindspore/train/model.py +21 -0
- mindspore/train/serialization.py +12 -19
- mindspore/version.py +1 -1
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/METADATA +9 -7
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/RECORD +285 -158
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_576ceaeef5870c451cab59af55ea46ad.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_86a73ff6e28d734c96bb8d3054f7dd18.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_f55e0ebaad1f2f572e43677336992fa0.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/all_finite.json +0 -109
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/binary_info_config.json +0 -38
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_proto/lib/linux/x86_64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/version.info +0 -1
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910}/op_api/include/aclnn_decoder_kv_cache.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910}/op_api/include/aclnn_prompt_kv_cache.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/decoder_kv_cache.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/prompt_kv_cache.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_api/include/aclnn_all_finite.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_impl/ai_core/tbe/config/ascend910b/aic-ascend910b-ops-info.json +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl}/dynamic/all_finite.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_proto/inc/op_proto.h +0 -0
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/WHEEL +0 -0
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/entry_points.txt +0 -0
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/top_level.txt +0 -0
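
Before the per-file diffs, a note on the listing itself: most of the churn is mechanical. The custom_ascendc_ops plugin tree is split into per-SoC custom_ascendc_910 and custom_ascendc_910b trees carrying prebuilt decoder_kv_cache, prompt_kv_cache and all_finite kernels, the bundled native libraries are rebuilt, and the Python-level behavior changes are concentrated under mindspore/train. A quick way to confirm the upgrade took effect, using MindSpore's documented public API (not taken from this diff):

import mindspore as ms

print(ms.__version__)  # expect '2.4.10' after upgrading
ms.run_check()         # runs a tiny graph to validate the installation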
mindspore/train/_utils.py
CHANGED
@@ -16,6 +16,8 @@
 from __future__ import absolute_import
 
 import os
+import threading
+from datetime import datetime
 import json
 from collections.abc import Iterable
 
@@ -76,7 +78,14 @@ def _exec_datagraph(exec_dataset, dataset_size, phase='dataset', create_data_inf
     queue_name = str("")
 
     use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
-    if use_pipeline_parallel:
+
+    # temp env to disable dynamic feature of sink size 1
+    dynamic_sink1_env = os.getenv("MS_DEV_DYNAMIC_SINK1", None)
+    dynamic_sink1 = True
+    if dynamic_sink1_env and dynamic_sink1_env.strip() in ['False', 'false']:
+        dynamic_sink1 = False
+
+    if use_pipeline_parallel or not dynamic_sink1:
        create_data_info_queue = False
 
     exec_dataset = exec_dataset.device_que(send_epoch_end=send_epoch_end,
@@ -303,10 +312,68 @@ def parse_strategy_ckpt(file_name):
 
         for ele in param.parallel_layouts.tensor_map[0].ListFields()[0][1]:
             tensor_map.append(ele)
-        layout_dict[param.param_name] = [dev_matrix, tensor_map]
+        layout_dict[param.param_name] = [dev_matrix, tensor_map, param.parallel_layouts.opt_weight_shard_step,
+                                         param.parallel_layouts.opt_weight_shard_size]
     return layout_dict
 
 
+def _get_strategy_opt_shard(param_redundancy_dict, parameter_layout_opt_shard):
+    """Strategy ckpt append opt shard."""
+    for key, value in parameter_layout_opt_shard.items():
+        if value[1] not in (-1, 0):
+            opt_para_num = value[1]
+            param_redundancy_ranks = param_redundancy_dict.get(key)
+            res = []
+            for param_ranks in param_redundancy_ranks:
+                if len(param_ranks) % opt_para_num == 0:
+                    for i in range(0, opt_para_num):
+                        res.append(param_ranks[i::opt_para_num])
+            param_redundancy_dict[key] = tuple(res)
+
+
+def _get_layout_opt_shard(layout_obj, param_redundancy_dict):
+    """Layout ckpt append opt shard."""
+    for key, value in layout_obj.items():
+        if value[5]:
+            world_groups = ("hccl_world_group", "nccl_world_group", "mccl_world_group")
+            if value[5] in world_groups:
+                opt_para_num = get_group_size()
+            elif "-" in value[5]:
+                opt_para_str = value[5].split("-")[0]
+                opt_para_num = int(opt_para_str)
+            else:
+                raise ValueError(f"For get_parameter_redundancy, the format of the parallel communication domain for "
+                                 f"the optimizer is incorrect.")
+            param_redundancy_ranks = param_redundancy_dict.get(key)
+            res = []
+            for param_ranks in param_redundancy_ranks:
+                if len(param_ranks) % opt_para_num == 0:
+                    for i in range(0, opt_para_num):
+                        res.append(param_ranks[i::opt_para_num])
+            param_redundancy_dict[key] = tuple(res)
+
+
+def _get_parameter_redundancy_without_opt_shard(parameter_layout, param_redundancy_dict, initial_rank):
+    """Get parameter redundancy without opt shard."""
+    for key, (slices, deploy_loc, *_) in parameter_layout.items():
+        redundancy_matrix = np.zeros(shape=slices + [len(slices)], dtype=np.int8)
+        for i in deploy_loc:
+            internal_slice = tuple(slice(None) for _ in range(i))
+            for j in range(slices[-i - 1]):
+                if i == -1:
+                    continue
+                else:
+                    redundancy_matrix[(..., j) + internal_slice + (i,)] = j
+        locate_list = redundancy_matrix.reshape((-1, len(slices))).tolist()
+        redundancy_dict = {}
+        for index, locate in enumerate(locate_list):
+            redundancy_dict.setdefault(tuple(locate), []).append(index + initial_rank)
+        redundancy_list = []
+        for _, indices in sorted(redundancy_dict.items()):
+            redundancy_list.append(tuple(indices))
+        param_redundancy_dict[key] = tuple(redundancy_list)
+
+
 def get_parameter_redundancy(layout_obj, initial_rank=0):
     """
     Get parameter redundancy map.
@@ -327,7 +394,12 @@ def get_parameter_redundancy(layout_obj, initial_rank=0):
          'param4': ((0, 4, 8, 12), (1, 5, 9, 13), (2, 6, 10, 14), (3, 7, 11, 15))}
     """
     if isinstance(layout_obj, str):
-        parameter_layout = parse_strategy_ckpt(layout_obj)
+        parameter_layout_total = parse_strategy_ckpt(layout_obj)
+        parameter_layout = {}
+        parameter_layout_opt_shard = {}
+        for key, value in parameter_layout_total.items():
+            parameter_layout[key] = value[0:2]
+            parameter_layout_opt_shard[key] = value[2:]
     elif isinstance(layout_obj, Cell):
         from mindspore.communication.management import get_process_group_ranks
         groups_ranks = (tuple(get_process_group_ranks()),)
@@ -339,37 +411,14 @@ def get_parameter_redundancy(layout_obj, initial_rank=0):
             parameter_layout[k] = v[:2]
 
     param_redundancy_dict = {}
-    for key, (slices, deploy_loc, *_) in parameter_layout.items():
-        redundancy_matrix = np.zeros(shape=slices + [len(slices)], dtype=np.int8)
-        for i in deploy_loc:
-            internal_slice = tuple(slice(None) for _ in range(i))
-            for j in range(slices[-i - 1]):
-                if i == -1:
-                    continue
-                else:
-                    redundancy_matrix[(..., j) + internal_slice + (i,)] = j
-        locate_list = redundancy_matrix.reshape((-1, len(slices))).tolist()
-        redundancy_dict = {}
-        for index, locate in enumerate(locate_list):
-            redundancy_dict.setdefault(tuple(locate), []).append(index + initial_rank)
-        redundancy_list = []
-        for _, indices in sorted(redundancy_dict.items()):
-            redundancy_list.append(tuple(indices))
-        param_redundancy_dict[key] = tuple(redundancy_list)
+
+    _get_parameter_redundancy_without_opt_shard(parameter_layout, param_redundancy_dict, initial_rank)
+
     if isinstance(layout_obj, str):
-        return param_redundancy_dict
+        _get_strategy_opt_shard(param_redundancy_dict, parameter_layout_opt_shard)
+    else:
+        _get_layout_opt_shard(layout_obj, param_redundancy_dict)
 
-    for key, value in layout_obj.items():
-        if value[5]:
-            world_groups = ("hccl_world_group", "nccl_world_group", "mccl_world_group")
-            opt_para_num = int(value[5][0]) if value[5] not in world_groups else get_group_size()
-            param_redundancy_ranks = param_redundancy_dict.get(key)
-            res = []
-            for param_ranks in param_redundancy_ranks:
-                if len(param_ranks) % opt_para_num == 0:
-                    for i in range(0, opt_para_num):
-                        res.append(param_ranks[i::opt_para_num])
-            param_redundancy_dict[key] = tuple(res)
     return param_redundancy_dict
 
 
@@ -463,3 +512,14 @@ def parse_hccl_file(hccl_file_path):
             rankid_dict[int(device["rank_id"])] = device["device_ip"]
 
     return rankid_dict
+
+
+def vlog_print(level, module, file, line, message):
+    '''Read environment variable VLOG_v and print to log'''
+    if os.environ.get("VLOG_v") == level:
+        now = datetime.now()
+        formatted_time = now.strftime("%Y-%m-%d-%H:%M:%S.%f")[:-3] + f".{now.microsecond // 1000}"
+        path = 'mindspore' + file.split("mindspore")[-1]
+        pid = os.getpid()
+        thread_id = threading.get_ident()
+        print(f"[V{level}] {module}({pid},{thread_id},python):{formatted_time} [{path}:{line}] {message}", flush=True)
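
The new vlog_print helper above emits only when the VLOG_v environment variable exactly equals the requested level string, so the calls added throughout model.py (below) cost nothing unless verbose logging is switched on. A minimal sketch of that gate, using the function exactly as defined in this diff:

import os

os.environ["VLOG_v"] = "1"  # must match the level string exactly

from mindspore.train._utils import vlog_print
vlog_print("1", "ME", __file__, 42, "visible")  # prints a [V1] ME(pid,tid,python):... line
vlog_print("2", "ME", __file__, 43, "hidden")   # suppressed: VLOG_v != "2"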
mindspore/train/callback/_checkpoint.py
CHANGED
@@ -44,6 +44,15 @@ SAVE_DIR = _cur_dir
 _info_list = ["epoch_num", "step_num"]
 
 
+def _wait_async_save_ckpt(async_save=False):
+    """Waiting for asynchronous saving of ckpt to complete."""
+    if async_save:
+        thread_list = threading.enumerate()
+        for thread in thread_list:
+            if thread.getName() == "asyn_save_ckpt":
+                thread.join()
+
+
 def _get_dp_tp_from_redundancy(redundancy_tuple):
     """From redundancy get dp and tp"""
     dp = []
@@ -568,6 +577,7 @@ class ModelCheckpoint(Callback):
                             "string that does not contain '/', but got {}.".format(self._prefix))
         if self._directory_func:
             self._directory = self._directory_func(cb_params)
+            _make_directory(self._directory)
         collect_host_info("Callback", "ModelCheckpoint", "step_end", start_time=get_clock_syscnt(), level=1)
         # In disaster recovery scenario, the training process may be rolled back to the last step where
         # the ckpt was successfully saved, so the _last_triggered_step should be updated.
@@ -575,7 +585,6 @@ class ModelCheckpoint(Callback):
             self._last_triggered_step = cb_params.last_save_ckpt_step
             cb_params.last_save_ckpt_step = None
 
-        _make_directory(self._directory)
         # save graph (only once)
         if not self._graph_saved:
             graph_file_name = os.path.join(self._directory, self._prefix + '-graph.meta')
@@ -583,10 +592,6 @@ class ModelCheckpoint(Callback):
                 os.remove(graph_file_name)
             _save_graph(cb_params.train_network, graph_file_name)
             self._graph_saved = True
-        thread_list = threading.enumerate()
-        for thread in thread_list:
-            if thread.getName() == "asyn_save_ckpt":
-                thread.join()
         self._save_ckpt(cb_params)
 
     def end(self, run_context):
@@ -602,10 +607,7 @@ class ModelCheckpoint(Callback):
 
         self._save_ckpt(cb_params, _to_save_last_ckpt)
 
-        thread_list = threading.enumerate()
-        for thread in thread_list:
-            if thread.getName() == "asyn_save_ckpt":
-                thread.join()
+        _wait_async_save_ckpt(self._config.async_save)
 
         destroy_allgather_cell()
@@ -643,6 +645,7 @@ class ModelCheckpoint(Callback):
         step_num_in_epoch = int((cb_params.cur_step_num - 1) % cb_params.batch_num + 1)
 
         if save_ckpt:
+            _wait_async_save_ckpt(self._config.async_save)
            if self._prefix_func:
                cur_ckpoint_file = self._prefix + f".{self._config.format}"
            else:
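
The checkpoint change is a refactor plus a behavior fix: the duplicated join loops become _wait_async_save_ckpt, the wait is now conditional on async_save, and it happens right before writing the next checkpoint instead of on every step_end. The underlying pattern is plain Python threading; a self-contained sketch (the thread name "asyn_save_ckpt" is the one used above):

import threading
import time

def _worker():
    time.sleep(0.1)  # stands in for writing a checkpoint file

threading.Thread(target=_worker, name="asyn_save_ckpt").start()

# Same pattern as _wait_async_save_ckpt: find the saver thread by name, then join it.
for thread in threading.enumerate():
    if thread.name == "asyn_save_ckpt":
        thread.join()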
mindspore/train/callback/_on_request_exit.py
CHANGED
@@ -240,7 +240,18 @@ class OnRequestExit(Callback):
             if param.name == "graceful_exit" and param.asnumpy() == True:  # pylint: disable=C0121
                 logger.warning("Graceful exit is triggered, stop training.")
                 if self.save_ckpt:
-                    save_checkpoint(net, self.train_name, integrated_save=self.integrated_save)
+                    append_dict = {"epoch_num": call_params.cur_epoch_num,
+                                   "step_num": call_params.cur_step_num,
+                                   "batch_num": call_params.batch_num}
+                    if call_params.loss_scale_mananger is not None:
+                        append_dict["loss_scale"] = call_params.loss_scale_mananger.get_loss_scale()
+                    if call_params.optimizer is not None:
+                        global_step = int(call_params.optimizer.global_step.data)
+                    else:
+                        global_step = int(call_params.network.optimizer.global_step.data)
+                    append_dict["global_step"] = global_step
+                    save_checkpoint(net, self.train_name, integrated_save=self.integrated_save,
+                                    append_dict=append_dict)
                 if self.save_mindir:
                     inputs = call_params.train_dataset_element
                     export(net, *inputs, file_name=self.train_name, file_format='MINDIR')
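
A checkpoint saved on graceful exit now also records epoch_num, step_num, batch_num, global_step and, when a loss-scale manager is attached, loss_scale, via save_checkpoint's append_dict argument (note the upstream spelling loss_scale_mananger, which model.py sets below). A hedged sketch of reading those entries back; load_checkpoint is the standard API, and the file name here is illustrative:

import mindspore as ms

params = ms.load_checkpoint("train_name.ckpt")  # OnRequestExit saves under its train_name

# append_dict entries come back alongside the weights.
for key in ("epoch_num", "step_num", "batch_num", "global_step"):
    if key in params:
        print(key, params[key].asnumpy())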
mindspore/train/callback/_tft_register.py
CHANGED
@@ -21,6 +21,7 @@ from mindspore import _checkparam as Validator
 from mindspore.train.callback._callback import Callback
 from mindspore import context
 from mindspore.common.parameter import Parameter
+from mindspore.common.tensor import Tensor
 from mindspore.communication import get_rank, get_group_size
 from mindspore import log as logger
 from mindspore.train.serialization import _get_cur_rank_dp
@@ -29,6 +30,9 @@ from mindspore._c_expression import clean_tdt_channel
 from mindspore._c_expression import send_recv
 from mindspore._c_expression import CollectiveManager
 from mindspore._c_expression import _get_uce_process_strategy, _get_uce_mem_info
+from mindspore._c_expression import Tensor as Tensor_
+import mindspore
+import mindspore.common.dtype as mstype
 
 def _get_ckpt_dir(step, ckpt_save_path, is_tmp_file):
     """ Common func to generate ckpt dir name."""
@@ -39,6 +43,9 @@ def _get_ckpt_dir(step, ckpt_save_path, is_tmp_file):
 def _save_checkpoint_on_failure(step, save_info, args, cb_ctx):
     """ Callback used for TFT save ckpt function when errors occur."""
     logger.info("Enter _save_checkpoint_on_failure function")
+    if not cb_ctx._is_params_consistent():  # pylint: disable=W0212
+        raise RuntimeError("Can't save parameters, because they are left in inconsistent state!")
+
     ckpt_save_path = cb_ctx.ckpt_save_path
     cb_params = args
     cur_rank = get_rank()
@@ -83,8 +90,6 @@ def _tft_exit_cb(ctx):
     _tft_sem_post()
     os._exit(1)  # pylint: disable=W0212
 
-
-
 def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_ctx):
     """ Callback used for TFT repair function."""
     logger.info("Enter _tft_repair_callback repair type: {}".format(repair_info["repair_type"]))
@@ -104,7 +109,7 @@ or repair_info["repair_type"] == cb_ctx.tft.RepairType.RT_UCE_LOWLEVEL.value):
     logger.info("Finish _tft_repair_callback")
 
 
-def _tft_clean_callback(is_uce_error, ctx):
+def _tft_clean_callback(is_uce_error, args, ctx):
     """ Callback used for TFT clean function."""
     logger.info("Enter _tft_clean_callback")
     ret = 0
@@ -125,10 +130,13 @@ def _tft_clean_callback(is_uce_error, ctx):
     return ret
 
 
-def _tft_stop_callback(cb_ctx):
+def _tft_stop_callback(args, cb_ctx):
     """ Callback used for TFT stop function."""
     logger.info("Enter _tft_stop_callback device_id: {}".format(cb_ctx.device_id))
     _stop_device(cb_ctx.device_id)
+    if (not cb_ctx.is_uce_rank) and (not cb_ctx._is_params_consistent()):  # pylint: disable=W0212
+        raise RuntimeError("Can't stop device, because training parameters are left in inconsistent state!")
+    cb_ctx.is_uce_rank = False
     logger.info("Finish _tft_stop_callback")
 
 
@@ -254,15 +262,29 @@ class TFTRegister(Callback):
         # let it raise errors if not install mindio_tft package
         from mindio_ttp import framework_ttp as tft
         self.tft = tft
+        self.is_uce_rank = False
         self.global_step = 0
         Validator.check_non_negative_int(ctrl_port)
         self.has_init_replica = False
         self._controller_ip = ctrl_ip
         self._controller_rank_id = ctrl_rank_id
         self._controller_port = ctrl_port
+        self.cb_params = None
         self.device_id = context.get_context("device_id")
         self._init_tft()
         self.ckpt_save_path = ckpt_save_path
+        self.assign = mindspore.ops.Assign()
+        self.g_one = Parameter(Tensor([1], dtype=mstype.int32))
+        self.s1 = mindspore.hal.Stream()
+
+    def _is_params_consistent(self):
+        for key, param in self.cb_params.train_network.parameters_and_names():
+            if "tft_g_one_flag" in key:
+                with mindspore.hal.StreamCtx(self.s1):
+                    tft_g_one_flag = Tensor(Tensor_.move_to(param, "CPU", False))
+                    self.s1.synchronize()
+                return int(tft_g_one_flag) == 1
+        return False
 
     def _set_tft_optimizer_replica(self, run_context):
         """ set Mindio TFT optimizer replica info, used internal. """
@@ -280,7 +302,7 @@ class TFTRegister(Callback):
         replica_info = [
             {
                 "type": 1,
-                "rank_list": dp,
+                "rank_list": list(dp),
                 "replica_cnt": len(dp),
                 "replica_shift": 0
             }
@@ -301,13 +323,12 @@ class TFTRegister(Callback):
         cur_rank = get_rank()
         enable_local_copy = False
         enable_arf = False
-        enable_zit = False
         enable_tls = False
         tls_key_dir = ""
 
         if cur_rank == self._controller_rank_id:
             logger.info(f"Begin to start tft controller on rank_id:{cur_rank}")
-            self.tft.tft_init_controller(cur_rank, world_size, enable_local_copy, enable_arf, enable_zit)
+            self.tft.tft_init_controller(cur_rank, world_size, enable_local_copy, enable_arf)
             self.tft.tft_start_controller(self._controller_ip, self._controller_port, enable_tls, tls_key_dir)
             logger.info("Finish start tft controller.")
@@ -328,12 +349,14 @@ class TFTRegister(Callback):
             self.has_init_replica = True
             self._set_tft_optimizer_replica(run_context)
         cb_params = run_context.original_args()
+        logger.info("START Set optimizer finish step status to TFT. step: {}".format(cb_params.cur_step_num))
+        self.tft.tft_end_updating_os(cb_params.cur_step_num)
         if cb_params.optimizer is not None:
             self.global_step = int(cb_params.optimizer.global_step.data)
+            self.assign(cb_params.optimizer.tft_g_one_flag, self.g_one)
         else:
             self.global_step = int(cb_params.network.optimizer.global_step.data)
-
-        self.tft.tft_end_updating_os(cb_params.cur_step_num)
+            self.assign(cb_params.network.optimizer.tft_g_one_flag, self.g_one)
         logger.info("END Set optimizer finish step status to TFT.")
 
 
@@ -344,6 +367,7 @@ class TFTRegister(Callback):
             raise ValueError("TFT feature doesn't support sink_size > 1.")
         logger.info("Set set args to TFT.")
        self.tft.tft_set_step_args(cb_params)
+        self.cb_params = cb_params
 
     def end(self, run_context):
         cur_rank = get_rank()
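
The recovery logic now revolves around a tft_g_one_flag parameter: on_train_step_end assigns 1 into it once the optimizer finishes a step, and the failure-path callbacks refuse to save a checkpoint or stop the device when the flag does not read back as 1, i.e. when a fault appears to have landed mid-update. The protocol itself is simple, as this plain-Python sketch shows (the classes here are hypothetical stand-ins, not MindSpore API; that the flag is cleared while an update is in flight is inferred from the check, not shown in this diff):

# Hypothetical stand-ins illustrating the consistency-flag protocol.
class Optimizer:
    def __init__(self):
        self.tft_g_one_flag = 1

    def step(self):
        self.tft_g_one_flag = 0   # assumed: cleared while the update is in flight
        # ... apply gradients; a fault here leaves the flag at 0 ...

def is_params_consistent(opt):
    # Mirrors _is_params_consistent: only flag == 1 is considered safe.
    return opt.tft_g_one_flag == 1

opt = Optimizer()
opt.step()
opt.tft_g_one_flag = 1            # what self.assign(..., self.g_one) does at step end
assert is_params_consistent(opt)  # safe to checkpoint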
mindspore/train/dataset_helper.py
CHANGED
@@ -15,6 +15,7 @@
 """Dataset help for minddata dataset"""
 from __future__ import absolute_import
 
+import os
 import math
 import copy
 
@@ -264,7 +265,14 @@ def connect_network_with_dataset(network, dataset_helper):
     queue_name = dataset.__transfer_dataset__.queue_name
     # In pipeline parallel, some stages have no GetNext, should not get in.
     use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
-    if _dynamic_sink_scenario(dataset, dataset_iter, is_dynamic) and not use_pipeline_parallel:
+
+    # temp env to disable dynamic feature of sink size 1
+    dynamic_sink1_env = os.getenv("MS_DEV_DYNAMIC_SINK1", None)
+    dynamic_sink1 = True
+    if dynamic_sink1_env and dynamic_sink1_env.strip() in ['False', 'false']:
+        dynamic_sink1 = False
+
+    if _dynamic_sink_scenario(dataset, dataset_iter, is_dynamic) and not use_pipeline_parallel and dynamic_sink1:
         dataset_types, dataset_shapes = dataset_helper.get_data_info()
         # Need to do full_batch for shapes which also do in the _DatasetIterMSLoopSink
         if _need_to_full():
@@ -306,7 +314,7 @@ def connect_network_with_dataset(network, dataset_helper):
         aux.__shape_type__ = str(dataset_types) + str(dataset_shapes)
 
     if _dynamic_sink_data(dataset, dataset_iter) and _dynamic_sink_exception_scenario(dataset_iter, is_dynamic) and \
-            not use_pipeline_parallel:
+            not use_pipeline_parallel and dynamic_sink1:
         dataset_helper.get_data_info()
         network.add_flags(sink_mode=True)
     return network
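
Both this file and _exec_datagraph in _utils.py gate the sink-size-1 dynamic-shape path behind the same temporary escape hatch: MS_DEV_DYNAMIC_SINK1. The parsing is deliberately narrow, which is easy to miss; this check mirrors the condition used in both diffs:

import os

def _dynamic_sink1_enabled():
    # Mirrors the diff: only the exact strings 'False'/'false' disable the feature.
    env = os.getenv("MS_DEV_DYNAMIC_SINK1", None)
    return not (env and env.strip() in ['False', 'false'])

os.environ["MS_DEV_DYNAMIC_SINK1"] = "false"
assert not _dynamic_sink1_enabled()

os.environ["MS_DEV_DYNAMIC_SINK1"] = "0"  # "0" does NOT disable it
assert _dynamic_sink1_enabled()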
mindspore/train/model.py
CHANGED
|
@@ -46,6 +46,7 @@ from mindspore.parallel._utils import _get_parallel_mode, _get_device_num, _get_
|
|
|
46
46
|
from mindspore.parallel._ps_context import _is_role_worker, _is_role_pserver, _is_ps_mode, \
|
|
47
47
|
_cache_enable, _enable_distributed_mindrt
|
|
48
48
|
from mindspore.train.metrics import Loss
|
|
49
|
+
from mindspore.train._utils import vlog_print
|
|
49
50
|
from mindspore import nn
|
|
50
51
|
from mindspore.boost import AutoBoost
|
|
51
52
|
from mindspore.context import ParallelMode
|
|
@@ -148,6 +149,7 @@ def _handle_tft(func):
|
|
|
148
149
|
e_str = str(e)
|
|
149
150
|
logger.info("uce wrapper caught RuntimeError e_str:{}".format(e_str))
|
|
150
151
|
if "UCEError" in e_str:
|
|
152
|
+
obj.is_uce_rank = True
|
|
151
153
|
logger.info("uce wrapper report UCEError")
|
|
152
154
|
tft.tft_report_error(tft.ReportState.RS_UCE.value)
|
|
153
155
|
elif "ForceStopError" in e_str:
|
|
@@ -654,10 +656,12 @@ class Model:
|
|
|
654
656
|
dataset.__loop_size__ = 1
|
|
655
657
|
|
|
656
658
|
if dataset_helper is None:
|
|
659
|
+
vlog_print("1", "ME", __file__, sys._getframe().f_lineno, "Begin to create DatasetHelper.")
|
|
657
660
|
logger.info("Begin to create DatasetHelper.")
|
|
658
661
|
dataset_helper = DatasetHelper(dataset, dataset_sink_mode, sink_size, epoch_num)
|
|
659
662
|
|
|
660
663
|
if dataset_sink_mode:
|
|
664
|
+
vlog_print("1", "ME", __file__, sys._getframe().f_lineno, "Begin to connect network with dataset.")
|
|
661
665
|
logger.info("Begin to connect network with dataset.")
|
|
662
666
|
network = connect_network_with_dataset(network, dataset_helper)
|
|
663
667
|
|
|
@@ -779,6 +783,7 @@ class Model:
|
|
|
779
783
|
if not train_dataset and not valid_dataset:
|
|
780
784
|
raise ValueError("The argument 'train_dataset' and 'valid_dataset' can not both be None or empty.")
|
|
781
785
|
|
|
786
|
+
vlog_print("1", "ME", __file__, sys._getframe().f_lineno, "Begin to check device number in model.build().")
|
|
782
787
|
logger.info("Begin to check device number in model.build() procedure.")
|
|
783
788
|
_device_number_check(self._parallel_mode, self._device_number)
|
|
784
789
|
|
|
@@ -787,17 +792,21 @@ class Model:
             raise TypeError("The type of 'train_dataset' must be `Dataset`, "
                             "but got {}.".format(type(train_dataset)))

+        vlog_print("1", "ME", __file__, sys._getframe().f_lineno,
+                   "Begin to check parameter broadcast in model.build().")
         logger.info("Begin to check parameter broadcast in model.build() procedure.")
         _parameter_broadcast_check(self._parallel_mode, self._parameter_broadcast)
         if self._parameter_broadcast:
             self._train_network.set_broadcast_flag()

+        vlog_print("1", "ME", __file__, sys._getframe().f_lineno, "Begin to exec preprocess in model.build().")
         logger.info("Begin to exec preprocess in model.build() procedure.")
         train_dataset.__no_send__ = True
         train_dataset_helper, train_network = self._exec_preprocess(is_train=True,
                                                                     dataset=train_dataset,
                                                                     dataset_sink_mode=True,
                                                                     sink_size=sink_size)
+        vlog_print("1", "ME", __file__, sys._getframe().f_lineno, "Begin to warmup dataset in model.build().")
         logger.info("Begin to warmup dataset in model.build() procedure.")
         self._warmup_dataset(epoch, train_dataset, sink_size)

@@ -805,13 +814,19 @@ class Model:
             delattr(train_dataset, "__no_send__")

         # Waiting for the dataset warmup ready
+        vlog_print("1", "ME", __file__, sys._getframe().f_lineno,
+                   "Begin waiting for dataset warmup in model.build().")
         logger.info("Begin waiting for dataset warmup in model.build() procedure.")
         self._waiting_for_dataset_warmup_ready(train_dataset)
+        vlog_print("1", "ME", __file__, sys._getframe().f_lineno,
+                   "The dataset warmup was successful in model.build().")
         logger.info("The dataset warmup was successful in model.build() procedure.")

         if context.get_auto_parallel_context("pipeline_stages") > 1 and valid_dataset:
             train_network.add_flags_recursive(is_first_iteration=True)
         for inputs in train_dataset_helper:
+            vlog_print("1", "ME", __file__, sys._getframe().f_lineno,
+                       "Begin to compile train network in model.build().")
             logger.info("Begin to compile train network in model.build() procedure.")
             train_network.compile(*inputs)
             self._train_network.parameter_layout_dict = train_network.parameter_layout_dict
@@ -832,6 +847,8 @@ class Model:
             if context.get_auto_parallel_context("pipeline_stages") > 1:
                 eval_network.add_flags_recursive(is_first_iteration=False)
             for inputs in valid_dataset_helper:
+                vlog_print("1", "ME", __file__, sys._getframe().f_lineno,
+                           "Begin to compile eval network in model.build().")
                 logger.info("Begin to compile eval network in model.build() procedure.")
                 eval_network.compile(*inputs)
                 break
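All of the `vlog_print` instrumentation above lands in `Model.build()`, which compiles the train (and optionally eval) network against one batch and warms up the dataset before training starts. A hedged usage sketch; the toy network is illustrative and `train_ds` is assumed to be an existing MindSpore dataset of (data, label) pairs:

```python
import mindspore as ms
from mindspore import nn
from mindspore.train import Model

# Toy network/optimizer; `train_ds` is an assumed, pre-built dataset.
net = nn.Dense(16, 4)
loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

model = Model(net, loss_fn=loss_fn, optimizer=opt)
model.build(train_dataset=train_ds, epoch=1)      # pay compile + warmup cost up front
model.train(1, train_ds, dataset_sink_mode=True)  # first step runs at steady-state speed
```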
@@ -905,6 +922,7 @@ class Model:
         epoch = 1
         cb_params.last_save_ckpt_step = None
         cb_params.latest_ckpt_file = None
+        cb_params.loss_scale_mananger = self._loss_scale_manager
 
         # build callback list
         with _CallbackManager(callbacks) as list_callback:
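The new `cb_params` entry exposes the loss scale manager to callbacks through the run context (the attribute keeps the source's `mananger` spelling). A hedged callback sketch reading it:

```python
from mindspore.train.callback import Callback

class LossScaleLogger(Callback):
    """Log the current loss scale at the end of every training epoch."""
    def on_train_epoch_end(self, run_context):
        cb_params = run_context.original_args()
        # attribute name as spelled in the source; absent on versions before 2.4.10
        manager = getattr(cb_params, "loss_scale_mananger", None)
        if manager is not None:
            print("loss scale:", manager.get_loss_scale())
```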
@@ -1567,8 +1585,11 @@ class Model:
         if hasattr(self._train_network, '_is_check_and_refresh') and not self._train_network._is_check_and_refresh:
             self._train_network.check_names_and_refresh_name()
             self._train_network._is_check_and_refresh = True
+        vlog_print("1", "ME", __file__, sys._getframe().f_lineno, "Begin to init dataset in model.build().")
         logger.info("Begin to init dataset in model.build() procedure.")
         self._init(train_dataset, valid_dataset, sink_size, epoch)
+        vlog_print("1", "ME", __file__, sys._getframe().f_lineno,
+                   "The model.build() which contains dataset warmup and network compile is success.")
         logger.info("The model.build() which contains dataset warmup and network compile is success.")

     def _eval_in_fit(self, valid_dataset, callbacks=None, dataset_sink_mode=True, cb_params=None):
mindspore/train/serialization.py
CHANGED
@@ -64,7 +64,7 @@ from mindspore.parallel._cell_wrapper import get_allgather_cell, _single_paramet
 from mindspore.parallel._tensor import _load_tensor, _get_tensor_strategy, _get_tensor_slice_index
 from mindspore.parallel._tensor import _reshape_param_data, _reshape_param_data_with_weight
 from mindspore.parallel._utils import _infer_rank_list, _remove_repeated_slices, _is_in_auto_parallel_mode, \
-    _get_device_num, _is_parallel_mode
+    _get_device_num
 from mindspore.parallel._auto_parallel_context import _get_auto_parallel_context
 from mindspore.parallel._parallel_serialization import _convert_to_list, _convert_to_layout, _build_searched_strategy, \
     _restore_group_info_list, _get_param_list_when_first_dim_sharded
@@ -81,6 +81,7 @@ from safetensors.numpy import save_file
 from safetensors import safe_open
 from ..ops.operations._opaque_predicate_registry import add_opaque_predicate, clean_funcs

+
 tensor_to_ms_type = {"Int8": mstype.int8, "UInt8": mstype.uint8, "Int16": mstype.int16, "UInt16": mstype.uint16,
                      "Int32": mstype.int32, "UInt32": mstype.uint32, "Int64": mstype.int64, "UInt64": mstype.uint64,
                      "Float16": mstype.float16, "Float32": mstype.float32, "Float64": mstype.float64,
@@ -328,6 +329,8 @@ def _exec_save(ckpt_file_name, data_list, enc_key=None, enc_mode="AES-GCM", map_
         file_name_list = list(os.path.splitext(ckpt_file_name))
         file_name_list[1] = file_name_list[1].replace(f".{format}", ".tmp")
         tmp_name = ''.join(file_name_list)
+        if _ckpt_fs.backend == "mindio":
+            tmp_name = ckpt_file_name
         if os.path.exists(ckpt_file_name):
             os.chmod(ckpt_file_name, stat.S_IWUSR)
             os.remove(ckpt_file_name)
@@ -383,7 +386,7 @@ def _exec_save(ckpt_file_name, data_list, enc_key=None, enc_mode="AES-GCM", map_
         if not os.path.exists(tmp_name):
             logger.warning(f"Rename failed, can't find {tmp_name}, it is possible that multiple processes have "
                            f"simultaneously modified a file.")
-        else:
+        elif _ckpt_fs.backend != "mindio":
             os.rename(tmp_name, ckpt_file_name)
             os.chmod(ckpt_file_name, stat.S_IRUSR)
     except BaseException as e:
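Both `_exec_save` hunks special-case the `mindio` checkpoint file-system backend: it writes `ckpt_file_name` directly and skips the temp-file rename that the default backend uses for atomicity. A minimal sketch of the write-then-rename pattern being bypassed:

```python
import os

def atomic_save(path: str, data: bytes) -> None:
    """Write to a sibling temp file, then rename over the target."""
    tmp_name = path + ".tmp"
    with open(tmp_name, "wb") as f:
        f.write(data)
    # os.replace is atomic on POSIX: readers see the old file or the new
    # one, never a half-written checkpoint.
    os.replace(tmp_name, path)
```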
@@ -1569,6 +1572,9 @@ def _parse_ckpt_proto(ckpt_file_name, dec_key, dec_mode, crc_check):
             raise ValueError("For 'load_checkpoint', the crc check is failed, "
                              "please check whether the ckpt file is damaged.")
         checkpoint_list.ParseFromString(pb_content)
+    except google.protobuf.message.DecodeError as e:
+        raise ValueError(f"Failed to read the checkpoint file {ckpt_file_name}. "
+                         f"The file may be corrupted, and the content cannot be parsed.") from e
     except BaseException as e:
         if _is_cipher_file(ckpt_file_name):
             err_info = "Failed to read the checkpoint file {}. The file may be encrypted or tempered with, " \
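Ordering matters here: the specific `DecodeError` handler must precede the blanket `BaseException` one so a corrupted checkpoint produces an actionable message. A self-contained sketch of the same ordering, using a stock protobuf message type in place of MindSpore's checkpoint proto:

```python
import google.protobuf.message
from google.protobuf import struct_pb2  # any generated message works for the demo

def parse_or_explain(raw: bytes) -> struct_pb2.Struct:
    msg = struct_pb2.Struct()
    try:
        msg.ParseFromString(raw)
    except google.protobuf.message.DecodeError as e:
        # specific handler first: corrupted bytes get a precise error
        raise ValueError("content cannot be parsed; the file may be corrupted") from e
    return msg

parse_or_explain(b"")  # an empty payload decodes to an empty message
```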
@@ -1598,19 +1604,6 @@ def _whether_load_param(specify_prefix, filter_prefix, param_name):
     return whether_load


-def _init_parameter_data_in_parallel_mode(net, parameter_dict):
-    """In parallel mode, only init the paraemters in ckpt."""
-    is_train_phase = net.phase.startswith('train')
-    for _, param in net.parameters_and_names():
-        if param.name in parameter_dict and param.from_ckpt and not is_train_phase:
-            param.shape = tuple(parameter_dict[param.name].shape)
-            continue
-        if param.name in parameter_dict and param.has_init:
-            logger.warning("{} is not init while load ckpt.".format(param.name))
-            new_tensor = param.init_data()
-            param._update_tensor_data(new_tensor)
-
-
 def _check_load_param_into_net(net, parameter_dict):
     """check load_param_into_net"""
     if not isinstance(net, nn.Cell):
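The deleted `_init_parameter_data_in_parallel_mode` eagerly materialized checkpoint-covered parameters via `param.init_data()`; the following hunks replace that with an `init_param` flag set as each parameter is actually updated. For context, `init_data()` is the public way to materialize a lazily initialized Parameter; a hedged sketch:

```python
import mindspore as ms
from mindspore.common.initializer import initializer

# A lazily initialized parameter holds an initializer, not data, until needed.
w = ms.Parameter(initializer("normal", (2, 3), ms.float32), name="w")
tensor = w.init_data()  # materialize the actual values
print(tensor.shape)     # (2, 3)
```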
@@ -1682,10 +1675,6 @@ def load_param_into_net(net, parameter_dict, strict_load=False, remove_redundanc
    logger.info("Execute the process of loading parameters into net.")
    for _, param in net.parameters_and_names():
        param.from_ckpt = True
-   if not (_is_in_auto_parallel_mode() or _is_parallel_mode()):
-       net.init_parameters_data()
-   else:
-       _init_parameter_data_in_parallel_mode(net, parameter_dict)
    param_not_load = []
    ckpt_not_load = list(parameter_dict.keys())
    for _, param in net.parameters_and_names():
@@ -1698,6 +1687,8 @@ def load_param_into_net(net, parameter_dict, strict_load=False, remove_redundanc
                continue
            new_param = parameter_dict[param.name]
            _update_param(param, new_param, strict_load)
+           if hasattr(param, "init_param") and not param.init_param:
+               param.init_param = True
            ckpt_not_load.remove(param.name)
        else:
            param_not_load.append(param.name)
@@ -1822,6 +1813,8 @@ def _load_dismatch_prefix_params(net, parameter_dict, param_not_load, strict_loa
        if param.name in param_not_load and new_param_name in parameter_dict:
            new_param = parameter_dict[new_param_name]
            _update_param(param, new_param, strict_load)
+           if hasattr(param, "init_param") and not param.init_param:
+               param.init_param = True
            param_not_load.remove(param.name)

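Net effect on the public load path: parameters are no longer bulk-initialized before loading, and each one actually taken from the checkpoint is marked via `init_param`. A hedged usage sketch; the toy net is illustrative and "net.ckpt" is assumed to exist on disk:

```python
import mindspore as ms
from mindspore import nn

net = nn.Dense(16, 4)  # toy net standing in for a real model
param_dict = ms.load_checkpoint("net.ckpt")
param_not_load, ckpt_not_load = ms.load_param_into_net(net, param_dict)
print("net params missing from ckpt:", param_not_load)
print("ckpt entries matching no param:", ckpt_not_load)
```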
mindspore/version.py
CHANGED
@@ -1 +1 @@
-__version__ = '2.4.0'
+__version__ = '2.4.10'
mindspore-{2.4.0 → 2.4.10}.dist-info/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mindspore
-Version: 2.4.0
+Version: 2.4.10
 Summary: MindSpore is a new open source deep learning training/inference framework that could be used for mobile, edge and cloud scenarios.
 Home-page: https://www.mindspore.cn
 Download-URL: https://github.com/mindspore-ai/mindspore/tags
@@ -118,7 +118,7 @@ MindSpore offers build options across multiple backends:

 | Hardware Platform | Operating System | Status |
 | :---------------- | :--------------- | :----- |
-
+| Ascend | Ubuntu-x86 | ✔️ |
 | | Ubuntu-aarch64 | ✔️ |
 | | EulerOS-aarch64 | ✔️ |
 | | CentOS-x86 | ✔️ |
@@ -316,15 +316,17 @@ Project stable branches will be in one of the following states:
 | Development | 3 months | Features are under development. |
 | Maintained | 6 - 12 months | All bugfixes are appropriate. Releases produced. |
 | Unmaintained| 0 - 3 months | All bugfixes are appropriate. No Maintainers and No Releases produced. |
-| End Of Life (EOL) | N/A |
+| End Of Life (EOL) | N/A | Version no longer accepting changes. |

 ## Maintenance status

-| **
+| **Version** | **Status** | **Initial Release Date** | **Next Phase** | **EOL Date**|
 |------------|--------------|--------------------------|----------------------------------------|-------------|
-| **r2.
-| **r2.
-| **r2.
+| **r2.4** | Maintained | 2024-10-30 | Unmaintained <br> 2025-10-30 estimated | 2025-10-30 |
+| **r2.3** | Maintained | 2024-07-15 | Unmaintained <br> 2025-07-15 estimated | 2025-07-15 |
+| **r2.2** | End Of Life | 2023-10-18 | | 2024-10-18 |
+| **r2.1** | End Of Life | 2023-07-29 | | 2024-07-29 |
+| **r2.0** | End Of Life | 2023-06-15 | | 2024-06-15 |
 | **r1.10** | End Of Life | 2023-02-02 | | 2024-02-02 |
 | **r1.9** | End Of Life | 2022-10-26 | | 2023-10-26 |
 | **r1.8** | End Of Life | 2022-07-29 | | 2023-07-29 |