mindstudio-probe 1.2.2__py3-none-any.whl → 8.1.0__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/METADATA +4 -3
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/RECORD +243 -191
- msprobe/README.md +57 -21
- msprobe/core/__init__.py +17 -0
- msprobe/core/common/const.py +224 -82
- msprobe/core/common/decorator.py +50 -0
- msprobe/core/common/exceptions.py +5 -3
- msprobe/core/common/file_utils.py +274 -40
- msprobe/core/common/framework_adapter.py +169 -0
- msprobe/core/common/global_lock.py +86 -0
- msprobe/core/common/runtime.py +25 -0
- msprobe/core/common/utils.py +148 -72
- msprobe/core/common_config.py +7 -0
- msprobe/core/compare/acc_compare.py +640 -462
- msprobe/core/compare/check.py +36 -107
- msprobe/core/compare/compare_cli.py +4 -0
- msprobe/core/compare/config.py +72 -0
- msprobe/core/compare/highlight.py +217 -215
- msprobe/core/compare/layer_mapping/layer_mapping.py +4 -1
- msprobe/core/compare/merge_result/merge_result.py +12 -6
- msprobe/core/compare/multiprocessing_compute.py +227 -107
- msprobe/core/compare/npy_compare.py +32 -16
- msprobe/core/compare/utils.py +218 -244
- msprobe/{mindspore/runtime.py → core/config_check/__init__.py} +2 -4
- msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
- msprobe/core/config_check/checkers/base_checker.py +60 -0
- msprobe/core/config_check/checkers/dataset_checker.py +138 -0
- msprobe/core/config_check/checkers/env_args_checker.py +96 -0
- msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
- msprobe/core/config_check/checkers/pip_checker.py +90 -0
- msprobe/core/config_check/checkers/random_checker.py +367 -0
- msprobe/core/config_check/checkers/weights_checker.py +147 -0
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
- msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
- msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
- msprobe/core/config_check/config_check_cli.py +51 -0
- msprobe/core/config_check/config_checker.py +100 -0
- msprobe/{pytorch/parse.py → core/config_check/resource/dependency.yaml} +7 -4
- msprobe/core/config_check/resource/env.yaml +57 -0
- msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
- msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
- msprobe/core/config_check/utils/utils.py +107 -0
- msprobe/core/data_dump/api_registry.py +239 -0
- msprobe/core/data_dump/data_collector.py +36 -9
- msprobe/core/data_dump/data_processor/base.py +74 -53
- msprobe/core/data_dump/data_processor/mindspore_processor.py +119 -78
- msprobe/core/data_dump/data_processor/pytorch_processor.py +134 -96
- msprobe/core/data_dump/json_writer.py +146 -57
- msprobe/core/debugger/precision_debugger.py +143 -0
- msprobe/core/grad_probe/constant.py +2 -1
- msprobe/core/grad_probe/grad_compare.py +2 -2
- msprobe/core/grad_probe/utils.py +1 -1
- msprobe/core/hook_manager.py +242 -0
- msprobe/core/monitor/anomaly_processor.py +384 -0
- msprobe/core/overflow_check/abnormal_scene.py +2 -0
- msprobe/core/service.py +356 -0
- msprobe/core/single_save/__init__.py +0 -0
- msprobe/core/single_save/single_comparator.py +243 -0
- msprobe/core/single_save/single_saver.py +157 -0
- msprobe/docs/01.installation.md +6 -5
- msprobe/docs/02.config_introduction.md +89 -30
- msprobe/docs/03.config_examples.md +1 -0
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/05.data_dump_PyTorch.md +184 -50
- msprobe/docs/06.data_dump_MindSpore.md +193 -28
- msprobe/docs/07.accuracy_checker_PyTorch.md +13 -3
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +72 -10
- msprobe/docs/09.accuracy_checker_MindSpore.md +19 -7
- msprobe/docs/10.accuracy_compare_PyTorch.md +266 -102
- msprobe/docs/11.accuracy_compare_MindSpore.md +117 -43
- msprobe/docs/12.overflow_check_PyTorch.md +5 -3
- msprobe/docs/13.overflow_check_MindSpore.md +6 -4
- msprobe/docs/14.data_parse_PyTorch.md +4 -10
- msprobe/docs/17.grad_probe.md +2 -1
- msprobe/docs/18.online_dispatch.md +3 -3
- msprobe/docs/19.monitor.md +211 -103
- msprobe/docs/21.visualization_PyTorch.md +100 -28
- msprobe/docs/22.visualization_MindSpore.md +103 -31
- msprobe/docs/23.generate_operator_PyTorch.md +9 -9
- msprobe/docs/25.tool_function_introduction.md +23 -22
- msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
- msprobe/docs/27.dump_json_instruction.md +278 -8
- msprobe/docs/28.debugger_save_instruction.md +111 -20
- msprobe/docs/28.kernel_dump_MindSpore.md +1 -1
- msprobe/docs/29.data_dump_MSAdapter.md +229 -0
- msprobe/docs/30.overflow_check_MSAdapter.md +31 -0
- msprobe/docs/31.config_check.md +95 -0
- msprobe/docs/32.ckpt_compare.md +69 -0
- msprobe/docs/33.generate_operator_MindSpore.md +190 -0
- msprobe/docs/34.RL_collect.md +92 -0
- msprobe/docs/35.nan_analyze.md +72 -0
- msprobe/docs/FAQ.md +3 -11
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/merge_result.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/docs/img/visualization/vis_browser_1.png +0 -0
- msprobe/docs/img/visualization/vis_match_info.png +0 -0
- msprobe/docs/img/visualization/vis_precision_info.png +0 -0
- msprobe/docs/img/visualization/vis_search_info.png +0 -0
- msprobe/docs/img/visualization/vis_show_info.png +0 -0
- msprobe/docs/img/visualization/vis_showcase.png +0 -0
- msprobe/docs/img/visualization/vis_unmatch_info.png +0 -0
- msprobe/mindspore/__init__.py +3 -3
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +151 -55
- msprobe/mindspore/api_accuracy_checker/api_runner.py +25 -11
- msprobe/mindspore/api_accuracy_checker/base_compare_algorithm.py +2 -1
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +580 -0
- msprobe/mindspore/api_accuracy_checker/bench_functions/fusion_operator.py +41 -0
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
- msprobe/mindspore/api_accuracy_checker/data_manager.py +4 -3
- msprobe/mindspore/api_accuracy_checker/generate_op_script/config_op.json +9 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +451 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +11 -1
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
- msprobe/mindspore/cell_processor.py +204 -33
- msprobe/mindspore/code_mapping/graph_parser.py +4 -21
- msprobe/mindspore/common/const.py +73 -2
- msprobe/mindspore/common/utils.py +157 -29
- msprobe/mindspore/compare/common_dir_compare.py +382 -0
- msprobe/mindspore/compare/distributed_compare.py +2 -26
- msprobe/mindspore/compare/ms_compare.py +18 -398
- msprobe/mindspore/compare/ms_graph_compare.py +20 -10
- msprobe/mindspore/compare/utils.py +37 -0
- msprobe/mindspore/debugger/debugger_config.py +59 -7
- msprobe/mindspore/debugger/precision_debugger.py +83 -90
- msprobe/mindspore/dump/cell_dump_process.py +902 -0
- msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +889 -0
- msprobe/mindspore/dump/dump_tool_factory.py +18 -8
- msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
- msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
- msprobe/mindspore/dump/hook_cell/api_register.py +176 -0
- msprobe/mindspore/dump/hook_cell/hook_cell.py +22 -12
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +42 -26
- msprobe/mindspore/dump/jit_dump.py +35 -27
- msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -16
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +22 -12
- msprobe/mindspore/free_benchmark/common/utils.py +1 -1
- msprobe/mindspore/free_benchmark/perturbation/perturbation_factory.py +4 -2
- msprobe/mindspore/free_benchmark/self_check_tool_factory.py +6 -3
- msprobe/mindspore/grad_probe/global_context.py +9 -2
- msprobe/mindspore/grad_probe/grad_analyzer.py +2 -1
- msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
- msprobe/mindspore/grad_probe/hook.py +2 -4
- msprobe/mindspore/mindspore_service.py +111 -0
- msprobe/mindspore/monitor/common_func.py +52 -0
- msprobe/mindspore/monitor/data_writers.py +237 -0
- msprobe/mindspore/monitor/distributed/wrap_distributed.py +1 -1
- msprobe/mindspore/monitor/features.py +13 -1
- msprobe/mindspore/monitor/module_hook.py +568 -444
- msprobe/mindspore/monitor/optimizer_collect.py +331 -0
- msprobe/mindspore/monitor/utils.py +71 -9
- msprobe/mindspore/ms_config.py +16 -15
- msprobe/mindspore/overflow_check/overflow_check_tool_factory.py +5 -3
- msprobe/mindspore/task_handler_factory.py +5 -2
- msprobe/msprobe.py +19 -0
- msprobe/nan_analyze/__init__.py +14 -0
- msprobe/nan_analyze/analyzer.py +255 -0
- msprobe/nan_analyze/graph.py +189 -0
- msprobe/nan_analyze/utils.py +211 -0
- msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
- msprobe/pytorch/api_accuracy_checker/compare/api_precision_compare.py +3 -6
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +15 -13
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +206 -4
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +9 -9
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +6 -5
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +31 -9
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +28 -20
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/attl.py +3 -1
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +154 -0
- msprobe/pytorch/attl_manager.py +65 -0
- msprobe/pytorch/bench_functions/moe_gating_top_k_softmax.py +6 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
- msprobe/pytorch/common/utils.py +53 -19
- msprobe/pytorch/compare/distributed_compare.py +4 -36
- msprobe/pytorch/compare/pt_compare.py +13 -84
- msprobe/pytorch/compare/utils.py +47 -0
- msprobe/pytorch/debugger/debugger_config.py +34 -17
- msprobe/pytorch/debugger/precision_debugger.py +50 -96
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
- msprobe/pytorch/dump/module_dump/module_dump.py +15 -61
- msprobe/pytorch/dump/module_dump/module_processer.py +150 -114
- msprobe/pytorch/free_benchmark/common/utils.py +1 -1
- msprobe/pytorch/free_benchmark/compare/single_benchmark.py +1 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py +3 -3
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py +3 -3
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/change_value.py +1 -1
- msprobe/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py +1 -1
- msprobe/pytorch/free_benchmark/result_handlers/check_handler.py +1 -1
- msprobe/pytorch/function_factory.py +1 -1
- msprobe/pytorch/grad_probe/grad_monitor.py +2 -2
- msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
- msprobe/pytorch/hook_module/api_register.py +155 -0
- msprobe/pytorch/hook_module/hook_module.py +18 -22
- msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
- msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
- msprobe/pytorch/hook_module/register_optimizer_hook.py +2 -1
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +193 -75
- msprobe/pytorch/hook_module/utils.py +28 -2
- msprobe/pytorch/monitor/csv2tb.py +14 -4
- msprobe/pytorch/monitor/data_writers.py +259 -0
- msprobe/pytorch/monitor/distributed/wrap_distributed.py +8 -2
- msprobe/pytorch/monitor/module_hook.py +336 -241
- msprobe/pytorch/monitor/module_metric.py +17 -0
- msprobe/pytorch/monitor/optimizer_collect.py +244 -224
- msprobe/pytorch/monitor/utils.py +84 -4
- msprobe/pytorch/online_dispatch/compare.py +0 -2
- msprobe/pytorch/online_dispatch/dispatch.py +13 -2
- msprobe/pytorch/online_dispatch/dump_compare.py +8 -2
- msprobe/pytorch/online_dispatch/utils.py +3 -0
- msprobe/pytorch/parse_tool/lib/interactive_cli.py +1 -6
- msprobe/pytorch/parse_tool/lib/utils.py +5 -4
- msprobe/pytorch/pt_config.py +16 -11
- msprobe/pytorch/pytorch_service.py +70 -0
- msprobe/visualization/builder/graph_builder.py +69 -10
- msprobe/visualization/builder/msprobe_adapter.py +24 -12
- msprobe/visualization/compare/graph_comparator.py +63 -51
- msprobe/visualization/compare/mode_adapter.py +22 -20
- msprobe/visualization/graph/base_node.py +11 -4
- msprobe/visualization/graph/distributed_analyzer.py +1 -10
- msprobe/visualization/graph/graph.py +2 -13
- msprobe/visualization/graph/node_op.py +1 -2
- msprobe/visualization/graph_service.py +251 -104
- msprobe/visualization/utils.py +26 -44
- msprobe/mindspore/dump/hook_cell/api_registry.py +0 -207
- msprobe/mindspore/dump/hook_cell/wrap_api.py +0 -212
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -140
- msprobe/mindspore/monitor/anomaly_detect.py +0 -404
- msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
- msprobe/mindspore/service.py +0 -543
- msprobe/pytorch/hook_module/api_registry.py +0 -166
- msprobe/pytorch/hook_module/wrap_distributed.py +0 -79
- msprobe/pytorch/hook_module/wrap_functional.py +0 -66
- msprobe/pytorch/hook_module/wrap_npu_custom.py +0 -85
- msprobe/pytorch/hook_module/wrap_tensor.py +0 -69
- msprobe/pytorch/hook_module/wrap_torch.py +0 -84
- msprobe/pytorch/hook_module/wrap_vf.py +0 -60
- msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
- msprobe/pytorch/monitor/anomaly_detect.py +0 -410
- msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
- msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
- msprobe/pytorch/service.py +0 -470
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/WHEEL +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-1.2.2.dist-info → mindstudio_probe-8.1.0.dist-info}/top_level.txt +0 -0
- /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
- /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
- /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
msprobe/pytorch/monitor/module_metric.py

```diff
@@ -16,6 +16,7 @@ import re
 
 import torch
 
+from msprobe.pytorch.common.utils import is_float8_tensor
 from msprobe.pytorch.monitor.features import get_max, get_min, get_zeros, get_nans, get_norm, get_mean
 from msprobe.pytorch.monitor.utils import get_nan_tensor
 
@@ -143,6 +144,20 @@ class IdentMetric(Metric):
         return tensor
 
 
+@register_config_metric("shape")
+class ShapeMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        return tensor.shape
+
+
+@register_config_metric("dtype")
+class DtypeMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        return tensor.dtype
+
+
 def get_metrics(ops, tag2tensor, eps, out_dict=None):
     """
     :param ops: ["op1", "op2"]
@@ -166,6 +181,8 @@ def get_metrics(ops, tag2tensor, eps, out_dict=None):
             # Non-tensor in/output filled with nan.
             out_dict[tag].update({metric_name: get_nan_tensor() for metric_name in ops})
             continue
+        if is_float8_tensor(tensor):
+            tensor = tensor.float()
         for metric_name in ops:
             fun_metric = config_metric_registry.get(metric_name)
             out_dict[tag][metric_name] = fun_metric.get_metric(tensor, eps)
```
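The two metrics added above plug into the same decorator-driven registry the existing statistics use: `register_config_metric("shape")` records the metric under its name in `config_metric_registry`, and `get_metrics` later resolves it purely by string. The `is_float8_tensor` guard upcasts with `tensor.float()` before any statistic runs, presumably because reductions such as min/max/norm are not generally implemented for float8 dtypes. Below is a minimal, self-contained sketch of that registry pattern; the registry, `Metric` base class, and `__main__` usage are illustrative stand-ins, not the msprobe implementation.

```python
# Illustrative sketch of the decorator-registry pattern shown in the hunk above;
# the names below are stand-ins, not the actual msprobe classes.
config_metric_registry = {}


def register_config_metric(name):
    def decorator(cls):
        # Store an instance under its metric name so callers can resolve
        # "shape", "dtype", "norm", ... purely by string.
        config_metric_registry[name] = cls()
        return cls
    return decorator


class Metric:
    @staticmethod
    def get_metric_value(tensor, eps):
        raise NotImplementedError

    def get_metric(self, tensor, eps):
        return self.get_metric_value(tensor, eps)


@register_config_metric("shape")
class ShapeMetric(Metric):
    @staticmethod
    def get_metric_value(tensor, eps):
        return tensor.shape


if __name__ == "__main__":
    import torch
    t = torch.randn(2, 3)
    print(config_metric_registry["shape"].get_metric(t, eps=1e-8))  # torch.Size([2, 3])
```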
msprobe/pytorch/monitor/optimizer_collect.py

```diff
@@ -12,129 +12,120 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from collections import defaultdict
+from abc import abstractmethod
 
 import torch
-import torch.distributed as dist
 
 from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.monitor.utils import MVResult
+from msprobe.pytorch.monitor.utils import MVResult
+from msprobe.core.common.const import MonitorConst
 
 
 class OptimizerMon(object):
-    def __init__(self) -> None:
+    def __init__(self, torch_opt) -> None:
         self.fp16_to_fp32_param = {}
-        self.
+        self.torch_opt = torch_opt
+        self.state = {}
+
+    def narrow_from_flatten(self, param, flatten_state):
+        return flatten_state
+
+    def get_state(self, torch_opt):
+        if hasattr(torch_opt, 'chained_optimizers'):
+            for opt in torch_opt.chained_optimizers:
+                self._get_single_state(opt)
+        else:
+            self._get_single_state(torch_opt)
 
-    def
-
+    def fetch_grad(self, monitor, params2name):
+        if not self.fp16_to_fp32_param:
+            self.map_fp16_to_fp32_param(self.torch_opt)
 
-
-
-        exp_avg_sq_dict = defaultdict(float)
-        update_dict = defaultdict()
-        ratio_dict = defaultdict()
+        grad_dict = {}
+        first_param = True
         for param, name in params2name.items():
-            if
-
-
-
-
-
-
-            if
-
-
+            if monitor.duplicate_param.get(name, False):
+                continue
+            if self.fp16_to_fp32_param and param not in self.fp16_to_fp32_param:
+                continue
+            grad = param.main_grad if monitor.params_have_main_grad else param.grad
+            element_in_cur_partition = self.fp16_to_fp32_param.get(param, param).numel()
+            if param.numel() != element_in_cur_partition:
+                if first_param:
+                    grad = grad.flatten()[-element_in_cur_partition:]
+                else:  # supposed to be the last one
+                    grad = grad.flatten()[:element_in_cur_partition]
+            first_param = False
+
+            if grad is None:
+                if not monitor.fsdp_wrapped_module:
+                    logger.warning(f"grad is None: {name}, maybe something wrong happened.")
+                continue
+            tag = monitor.name2tag.get(name, {}).get(MonitorConst.POST_GRAD)
+            monitor.register_param_call_id("hook_optimizer", tag)
+            grad_dict[tag] = grad
+        return grad_dict
+
+    def map_fp16_to_fp32_param(self, torch_opt):
+        pass
+
+    def fetch_mv(self, monitor, params2name):
+        if not self.fp16_to_fp32_param:
+            self.map_fp16_to_fp32_param(self.torch_opt)
+        if not self.state:
+            self.get_state(self.torch_opt)
+
+        exp_avg_dict = {}
+        exp_avg_sq_dict = {}
+        update_dict = {}
+        ratio_dict = {}
+
+        if not self.state:
+            logger.warning('optimizer state can not accessed')
+            return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict)
+
+        for lp_param, name in params2name.items():
+            if lp_param in self.fp16_to_fp32_param:
+                hp_param = self.fp16_to_fp32_param[lp_param]
+            else:
+                hp_param = lp_param
+
+            if hp_param in self.state:
+                state_param = self.state.get(hp_param, {})
+                exp_avg = self.narrow_from_flatten(lp_param, state_param.get("exp_avg", None))
+                exp_avg_sq = self.narrow_from_flatten(lp_param, state_param.get("exp_avg_sq", None))
                 if monitor.mv_distribution:
                     exp_avg_dict[name] = exp_avg
                     exp_avg_sq_dict[name] = exp_avg_sq
                 if monitor.mg_direction:
                     exp_avg_dict[name] = exp_avg
                 if monitor.ur_distribution:
-                    if len(torch_opt.param_groups) > 1:
-                        logger.info(f"the length of torch_opt.param_groups is {len(torch_opt.param_groups)}.")
+                    if len(self.torch_opt.param_groups) > 1:
+                        logger.info(f"the length of torch_opt.param_groups is {len(self.torch_opt.param_groups)}.")
                     if 'step' in state_param:
                         step = state_param['step']  # Optimizer from pytorch or FusedAdam from apex(used by megatron)
-                    elif 'step' in torch_opt.param_groups[0]:
-                        step = torch_opt.param_groups[0]['step']  # AdamW from mindspeed
+                    elif 'step' in self.torch_opt.param_groups[0]:
+                        step = self.torch_opt.param_groups[0]['step']  # AdamW from mindspeed
                     else:
                         logger.warning(f"step of {name} is None, maybe something wrong happened.")
                         continue
-                    exp_avg_hat = exp_avg / (1 - torch_opt.defaults['betas'][0] ** step)
-                    exp_avg_sq_hat = exp_avg_sq / (1 - torch_opt.defaults['betas'][1] ** step)
-                    update_dict[name] = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + torch_opt.defaults['eps'])
+                    exp_avg_hat = exp_avg / (1 - self.torch_opt.defaults['betas'][0] ** step)
+                    exp_avg_sq_hat = exp_avg_sq / (1 - self.torch_opt.defaults['betas'][1] ** step)
+                    update_dict[name] = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + self.torch_opt.defaults['eps'])
                     ratio_dict[name] = exp_avg_hat / torch.sqrt(exp_avg_sq_hat)
                     monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name])
                     monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name])
         return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict)
-
-    def
-
-
-
-
-
-
-
-
-        def get_flatten_grad(self, optimizer, group_idx):
-            if fp32_partitioned_groups_flat[group_idx].grad is None:
-                if partition_id == dist.get_world_size() - 1 and not self.is_stage3:
-                    fp32_partitioned_groups_flat_grad = optimizer.flatten_dense_tensors_aligned(
-                        optimizer.averaged_gradients[group_idx],
-                        int(optimizer.partition_size[group_idx])
-                    ).to(fp32_partitioned_groups_flat[group_idx].dtype)
-                else:
-                    fp32_partitioned_groups_flat_grad = optimizer.flatten(
-                        optimizer.averaged_gradients[group_idx]
-                    ).to(fp32_partitioned_groups_flat[group_idx].dtype)
-                return fp32_partitioned_groups_flat_grad
-            else:
-                return fp32_partitioned_groups_flat[group_idx].grad
-
-        for group_idx in range(len(fp32_partitioned_groups_flat)):
-            fp32_partitioned_groups_flat_grad[group_idx] = get_flatten_grad(self, torch_opt, group_idx)
-
-        for name in params2name.values():
-            start_idx, end_idx, group_idx, group_with_rank = name2indices[name]
-            if group_with_rank != partition_id and isinstance(group_with_rank, int):
-                continue
-            fp32_param = fp32_partitioned_groups_flat[group_idx][start_idx: end_idx]
-            fp32_param.grad = fp32_partitioned_groups_flat_grad[group_idx][start_idx: end_idx]
-            param2name[fp32_param] = name
-            if not torch_opt.state:
-                continue
-            state_param = list(torch_opt.state.values())[group_idx]
-            exp_avg = state_param.get("exp_avg", None)
-            exp_avg_sq = state_param.get("exp_avg_sq", None)
-            if exp_avg is None or exp_avg_sq is None:
-                logger.warning(f"exp_avg or exp_avg_sq of {name} is None, maybe something wrong happened.")
-                continue
-            exp_avg = exp_avg[start_idx: end_idx]
-            exp_avg_sq = exp_avg_sq[start_idx: end_idx]
-            if monitor.mv_distribution:
-                exp_avg_dict[name] = exp_avg
-                exp_avg_sq_dict[name] = exp_avg_sq
-            if monitor.mg_direction:
-                exp_avg_dict[name] = exp_avg
-            if monitor.ur_distribution:
-                if 'step' in state_param:
-                    step = state_param['step']  # Optimizer from pytorch or FusedAdam from apex(used by megatron)
-                elif 'step' in torch_opt.param_groups[group_idx]:
-                    step = torch_opt.param_groups[group_idx]['step']  # AdamW from mindspeed
-                else:
-                    logger.warning(f"step of {name} is None, maybe something wrong happened.")
-                    continue
-                exp_avg_hat = exp_avg / (1 - torch_opt.defaults['betas'][0] ** step)
-                exp_avg_sq_hat = exp_avg_sq / (1 - torch_opt.defaults['betas'][1] ** step)
-                update_dict[name] = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + torch_opt.defaults['eps'])
-                ratio_dict[name] = exp_avg_hat / torch.sqrt(exp_avg_sq_hat)
-                monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name])
-                monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name])
-        del fp32_partitioned_groups_flat_grad
-        return MVGradResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict,
-                            grad=param2name)
+
+    def _get_single_state(self, torch_opt):
+        state = {}
+        if hasattr(torch_opt, 'param_to_cpu_states_map'):
+            state = torch_opt.param_to_cpu_states_map
+        elif hasattr(torch_opt, 'state'):
+            state = torch_opt.state
+        elif hasattr(torch_opt, 'optimizer') and hasattr(torch_opt.optimizer, 'state'):
+            state = torch_opt.optimizer.state
+        self.state.update(state)
 
 
 class MixPrecisionOptimizerMon(OptimizerMon):
```
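The rewritten `OptimizerMon` base class in optimizer_collect.py binds the optimizer at construction time (`self.torch_opt`) and splits monitoring into `fetch_grad`, which collects per-parameter gradients keyed by `monitor.name2tag`, and `fetch_mv`, which reads the first/second moments from the gathered `self.state` and derives the bias-corrected Adam update. As a reference for what `fetch_mv` computes, here is a small self-contained sketch against a plain `torch.optim.Adam`; the loop and variable names are illustrative, only the state keys (`exp_avg`, `exp_avg_sq`, `step`) and `defaults` come from PyTorch itself.

```python
# Minimal sketch of what fetch_mv reads from a plain torch.optim.Adam:
# exp_avg / exp_avg_sq live in optimizer.state, betas / eps in optimizer.defaults.
# Variable names here are illustrative, not msprobe's.
import torch

model = torch.nn.Linear(4, 2)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

model(torch.randn(8, 4)).sum().backward()
opt.step()

for param, name in zip(model.parameters(), ["weight", "bias"]):
    state = opt.state[param]
    exp_avg, exp_avg_sq, step = state["exp_avg"], state["exp_avg_sq"], state["step"]
    beta1, beta2 = opt.defaults["betas"]
    eps = opt.defaults["eps"]
    # Bias-corrected moments and the update direction reported as "update"/"ratio".
    exp_avg_hat = exp_avg / (1 - beta1 ** step)
    exp_avg_sq_hat = exp_avg_sq / (1 - beta2 ** step)
    update = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + eps)
    ratio = exp_avg_hat / torch.sqrt(exp_avg_sq_hat)
    print(name, update.norm().item(), ratio.norm().item())
```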
```diff
@@ -142,21 +133,14 @@ class MixPrecisionOptimizerMon(OptimizerMon):
     混合精度优化器监控类。在混合精度训练中监控和管理优化器。
     混合精度训练通过适当降低某些计算的精度来加速训练过程并减少内存消耗。
     """
-
-    def map_fp16_tp_fp32_param(self, torch_opt):
+    def map_fp16_to_fp32_param(self, torch_opt):
         for fp16_group, fp32_group in zip(torch_opt.float16_groups, torch_opt.fp32_from_float16_groups):
             for fp16_param, fp32_param in zip(fp16_group, fp32_group):
                 self.fp16_to_fp32_param[fp16_param] = fp32_param
 
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        if not self.fp16_to_fp32_param and torch_opt is not None:
-            self.map_fp16_tp_fp32_param(torch_opt)
-
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
-
 
 class MegatronDistributedOptimizerMon(OptimizerMon):
-    def
+    def map_fp16_to_fp32_param(self, torch_opt):
         if not (hasattr(torch_opt, "model_float16_groups") and
                 hasattr(torch_opt, "shard_fp32_from_float16_groups")):
             raise Exception(
```
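With the typo fixed (`map_fp16_tp_fp32_param` → `map_fp16_to_fp32_param`), the mixed-precision subclasses now only provide the fp16 → fp32 master-weight mapping; the shared `fetch_mv`/`fetch_grad` in the base class consume it. A hedged sketch of what that mapping amounts to, with made-up group lists standing in for Megatron's real `float16_groups`/`fp32_from_float16_groups` (model-precision and master copies, respectively):

```python
# Illustrative sketch of the fp16 -> fp32 master-weight mapping that
# map_fp16_to_fp32_param builds; the group structures below are made up.
import torch

float16_groups = [[torch.zeros(3, dtype=torch.float16), torch.zeros(2, dtype=torch.float16)]]
fp32_from_float16_groups = [[p.detach().float() for p in group] for group in float16_groups]

fp16_to_fp32_param = {}
for fp16_group, fp32_group in zip(float16_groups, fp32_from_float16_groups):
    for fp16_param, fp32_param in zip(fp16_group, fp32_group):
        fp16_to_fp32_param[fp16_param] = fp32_param

# Later, monitoring code resolves the high-precision copy for any model parameter:
some_fp16_param = float16_groups[0][0]
hp_param = fp16_to_fp32_param.get(some_fp16_param, some_fp16_param)
print(hp_param.dtype)  # torch.float32
```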
```diff
@@ -167,141 +151,176 @@ class MegatronDistributedOptimizerMon(OptimizerMon):
             for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group):
                 self.fp16_to_fp32_param[fp16_param] = shard_fp32_param
 
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        if not self.fp16_to_fp32_param and torch_opt is not None:
-            self.map_fp16_tp_fp32_param(torch_opt)
-
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
 
+class MegatronChainedDistributedOptimizerMon(MegatronDistributedOptimizerMon):
+    def map_fp16_to_fp32_param(self, torch_opt):
+        for opt in torch_opt.chained_optimizers:
+            super().map_fp16_to_fp32_param(opt)
 
-class MegatronFP32OptimizerMon(OptimizerMon):
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
 
+class MegatronChainedMixPrecisionOptimizerMon(MixPrecisionOptimizerMon):
+    def map_fp16_to_fp32_param(self, torch_opt):
+        for opt in torch_opt.chained_optimizers:
+            super().map_fp16_to_fp32_param(opt)
 
-class MegatronChainedDistributedOptimizerMon(MegatronDistributedOptimizerMon):
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        if not self.fp16_to_fp32_param and torch_opt is not None:
-            for opt in torch_opt.chained_optimizers:
-                self.map_fp16_tp_fp32_param(opt)
 
-
-
-
-
-
+class DeepSpeedZeroOptimizerMon(OptimizerMon):
+    """
+    Base monitor class for DeepSpeed ZeRO optimizer.
+    ZeRO stage 0 no partition
+    ZeRO stage 1 partitions optimizer states across data parallel processes.
+    ZeRO stage 2 additionally partitions gradients.
+    ZeRO stage 3 additionally partitions parameters.
+
+    This class provides monitoring capabilities for ZeRO optimizers by:
+    - Handling gradient collection for different ZeRO stages
+    - Managing optimizer state access for monitoring
+    """
+    def __init__(self, torch_opt):
+        super().__init__(torch_opt)
+        self.stage = ''
+        self.bit16_groups = []
+        self.fp32_flat_groups = []
+        self.param2group = ()
+        self.param2index = []
+        self.group_offset = {}
+
+    @abstractmethod
+    def get_grad_for_param(self, lp_param, group_idx, param_id):
+        raise NotImplementedError
+
+    def param_not_in_partition(self, lp_param, group_idx):
+        param_slice_mapping = self.torch_opt.state_dict()['param_slice_mappings'][group_idx]
+        hp_address = param_slice_mapping.get(self.torch_opt.param_names.get(lp_param))
+        return hp_address is None
+
+    def get_position(self, lp_param, group_idx):
+        param_slice_mapping = self.torch_opt.state_dict()['param_slice_mappings'][group_idx]
+        hp_address = param_slice_mapping.get(self.torch_opt.param_names.get(lp_param))
+        return hp_address.start, hp_address.numel
+
+    def get_group_index(self):
+        param2group = {}
+        for group_idx, bit16_group in enumerate(self.bit16_groups):
+            for param in bit16_group:
+                param2group[param] = group_idx
+        return param2group
+
+    def get_param_index(self, lp_param, group_idx):
+        if not self.param2index:
+            for group in self.bit16_groups:
+                param2index = {}
+                for index, param in enumerate(group):
+                    param2index[param] = index
+                self.param2index.append(param2index)
+
+        return self.param2index[group_idx][lp_param]
+
+    def narrow_from_flatten(self, param, flatten_state):
+        if flatten_state is None:
+            return flatten_state
+        group_idx = self.param2group[param]
+        if self.param_not_in_partition(param, group_idx):
+            return None
+        start, numel = self.get_position(param, group_idx)
+        return flatten_state.narrow(0, start, numel)
+
+    def map_fp16_to_fp32_param(self, torch_opt):
+        for group_idx, group in enumerate(self.bit16_groups):
+            for param in group:
+                self.fp16_to_fp32_param[param] = self.fp32_flat_groups[group_idx]
+
+    def fetch_grad(self, monitor, params2name):
+        grad_dict = {}
+        for lp_param, name in params2name.items():
+            group_idx = self.param2group[lp_param]
+            param_id = self.get_param_index(lp_param, group_idx)
+            if self.param_not_in_partition(lp_param, group_idx):
+                continue
+            if self.stage == '1or2':
+                param_id = param_id - self.group_offset[group_idx] - 1
+            grad = self.get_grad_for_param(lp_param, group_idx, param_id)
+            tag = monitor.name2tag.get(name, {}).get(MonitorConst.POST_GRAD)
+            monitor.register_param_call_id("hook_optimizer", tag)
+            grad_dict[tag] = grad
+
+        return grad_dict
+
+
+class DeepSpeedZeroOptimizerStage0Mon(DeepSpeedZeroOptimizerMon):
+    def __init__(self, torch_opt):
+        super().__init__(torch_opt)
+        self.stage = '0'
+        self.bit16_groups = torch_opt.bf16_groups
+        self.fp32_flat_groups = torch_opt.fp32_groups_flat_partition
+        self.param2group = self.get_group_index()
+
+    def get_grad_for_param(self, lp_param, group_idx, param_id):
+        return self.torch_opt.fp32_groups_gradient_dict[group_idx][param_id]
+
+
+class DeepSpeedZeroOptimizerStage1or2Mon(DeepSpeedZeroOptimizerMon):
+    def __init__(self, torch_opt):
+        super().__init__(torch_opt)
+        self.stage = '1or2'
+        self.bit16_groups = torch_opt.bit16_groups
+        self.fp32_flat_groups = torch_opt.single_partition_of_fp32_groups
+        self.param2group = self.get_group_index()
+        self.group_offset = {}
+        self.get_group_offset()
+
+    def get_grad_for_param(self, lp_param, group_idx, param_id):
+        if getattr(self.torch_opt, "cpu_offload", False):
+            grads = self.torch_opt.single_partition_of_fp32_groups[group_idx].grad
+            start, numel = self.get_position(lp_param, group_idx)
+            grad = grads.narrow(0, start, numel)
+        else:
+            grad = self.torch_opt.averaged_gradients[group_idx][param_id]
+        return grad
+
+    def get_group_offset(self):
+        for group_idx, group in enumerate(self.bit16_groups):
+            self.group_offset[group_idx] = -1
+            for lp_param in group:
+                if self.param_not_in_partition(lp_param, group_idx):
+                    self.group_offset[group_idx] = self.get_param_index(lp_param, group_idx)
+                else:
+                    break
 
 
-class
-    def
-
-
-
+class DeepSpeedZeroOptimizerStage3Mon(DeepSpeedZeroOptimizerMon):
+    def __init__(self, torch_opt):
+        super().__init__(torch_opt)
+        self.stage = '3'
+        self.bit16_groups = torch_opt.fp16_groups
+        self.fp32_flat_groups = torch_opt.fp32_partitioned_groups_flat
+        self.param2group = self.get_group_index()
 
-
-
-
-
-
-
-
-
-    def
-        return self.
-
-
-class DeepSpeedZeroOptimizerStage3Mon(OptimizerMon):
-    def get_param_index(self, params2name, name2index, torch_opt):
-        fp16_groups = torch_opt.fp16_partitioned_groups
-        name2indices = defaultdict()
-        index_length = defaultdict()
-        index = 0
-        idx = 0
-        for group_idx, fp16_group in enumerate(fp16_groups):
-            for param in fp16_group:
-                param_length = len(param.flatten())
-                index_length[idx] = (index, index + param_length, group_idx)
-                index += param_length
-                idx += 1
-        for _, name in params2name.items():
-            idx = name2index[name]
-            start_idx, end_idx, group_idx = index_length[idx]
-            name2indices[name] = (start_idx, end_idx, group_idx, None)
-        return name2indices
-
-    def fetch_mv(self, monitor, torch_opt, params2name, name2indices=None):
-        self.is_stage3 = True
-        fp32_partitioned_groups_flat = torch_opt.fp32_partitioned_groups_flat
-        return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat)
-
-
-class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
-    @staticmethod
-    def get_group_index(fp32_length, world_size, index):
-        for i in range(len(fp32_length) - 1):
-            if fp32_length[i] <= index < fp32_length[i + 1]:
-                interval_start = fp32_length[i]
-                interval_length = fp32_length[i + 1] - fp32_length[i]
-                sub_interval_length = interval_length // world_size
-                sub_index = (index - interval_start) // sub_interval_length
-                sub_interval_start = interval_start + sub_index * sub_interval_length
-                return sub_interval_start, min(sub_index, world_size - 1)
-        return fp32_length[-1], 0
-
-    def get_param_index(self, params2name, name2index, torch_opt):
-        padding = torch_opt.groups_padding
-        world_size = dist.get_world_size()
-        fp32_length = [0]
-        for fp32_group_index, single_partition_of_fp32_group in enumerate(torch_opt.single_partition_of_fp32_groups):
-            fp32_length.append(len(single_partition_of_fp32_group) * world_size + fp32_length[fp32_group_index])
-
-        bf16_groups = []
-        name2indices = defaultdict()
-        index_length = defaultdict()
-        index = 0
-        idx = 0
-        for group_idx, bf16_group in enumerate(torch_opt.bit16_groups):
-            bf16_groups.extend(bf16_group)
-            for param in bf16_group:
-                param_length = len(param.flatten())
-                group_index, group_with_rank = self.get_group_index(fp32_length, world_size, index)
-                index_length[idx] = (index, index + param_length, group_idx, group_index, group_with_rank)
-                index += param_length
-                idx += 1
-        group_length = len(bf16_groups) / len(torch_opt.bit16_groups)
-        for _, name in params2name.items():
-            name_index = name2index[name]
-            start_idx, end_idx, group_idx, group_index, group_with_rank = index_length[name_index]
-            need_padding = True if group_with_rank == world_size - 1 else False
-            new_start_idx = start_idx - group_index
-            new_end_idx = end_idx - group_index
-            if need_padding and group_length - 1 <= name_index <= len(bf16_groups) - 1 and name_index % (
-                    group_length - 1) == 0:
-                new_end_idx -= padding[int(name_index // (group_length - 1) - 1)]
-            name2indices[name] = (new_start_idx, new_end_idx, group_idx, group_with_rank)
-        return name2indices
-
-    def fetch_mv(self, monitor, torch_opt, params2name, name2indices=None):
-        fp32_partitioned_groups_flat = torch_opt.single_partition_of_fp32_groups
-        return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat)
-
-
-class DummyOptimizerMon(OptimizerMon):
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
+    def param_not_in_partition(self, param, group_index):
+        """Each param partioned across all zero ranks"""
+        return False
+
+    def get_position(self, lp_param, group_idx):
+        param_id = self.torch_opt.get_param_id(lp_param)
+        return self.torch_opt.grad_position[param_id][1:]
+
+    def get_grad_for_param(self, lp_param, group_idx, param_id):
+        return self.torch_opt.averaged_gradients[group_idx][param_id]
 
 
 class OptimizerMonFactory:
     _optimizer_mon_map = {
-        "FP32Optimizer":
+        "FP32Optimizer": OptimizerMon,
         "Float16OptimizerWithFloat16Params": MixPrecisionOptimizerMon,
         "DistributedOptimizer": MegatronDistributedOptimizerMon,
+        "SwapDistributedOptimizer": MegatronDistributedOptimizerMon,
         "ChainedDistributedOptimizer": MegatronChainedDistributedOptimizerMon,
+        "ChainedSwapDistributedOptimizer": MegatronChainedDistributedOptimizerMon,
         "ChainedFloat16OptimizerWithFloat16Params": MegatronChainedMixPrecisionOptimizerMon,
         "BF16_Optimizer": DeepSpeedZeroOptimizerStage0Mon,
         "DeepSpeedZeroOptimizer": DeepSpeedZeroOptimizerStage1or2Mon,
         "DeepSpeedZeroOptimizer_Stage3": DeepSpeedZeroOptimizerStage3Mon,
-        "Adam":
+        "Adam": OptimizerMon
     }
 
     @staticmethod
```
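The common trick across the ZeRO monitors above is `narrow_from_flatten`: optimizer state and master weights are kept as one flat fp32 buffer per group, and the per-parameter view is recovered with `Tensor.narrow(0, start, numel)` using offsets taken from DeepSpeed's `param_slice_mappings` (stages 0/1/2) or `grad_position` (stage 3). A tiny sketch of just the slicing step, with hard-coded offsets standing in for those lookups:

```python
# Sketch of the narrow_from_flatten idea: per-parameter state is a slice of a
# flat fp32 partition; the (start, numel) offsets below are hard-coded for illustration.
import torch

flat_exp_avg = torch.arange(12, dtype=torch.float32)   # flattened state for one group
param_offsets = {"layer.weight": (0, 8), "layer.bias": (8, 4)}

def narrow_from_flatten(name, flat_state):
    start, numel = param_offsets[name]
    # narrow(dim, start, length) returns a view of the flat buffer, no copy
    return flat_state.narrow(0, start, numel)

print(narrow_from_flatten("layer.bias", flat_exp_avg))  # tensor([ 8.,  9., 10., 11.])
```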
```diff
@@ -310,6 +329,7 @@ class OptimizerMonFactory:
         optimizer_class = optimizer.__class__.__name__
         if optimizer_class == "ChainedOptimizer":
             optimizer_class = "Chained" + optimizer.chained_optimizers[0].__class__.__name__
+        logger.info(f'The optimizer type is {optimizer_class}')
 
-        optimizer_mon_class = OptimizerMonFactory._optimizer_mon_map.get(optimizer_class,
-        return optimizer_mon_class()
+        optimizer_mon_class = OptimizerMonFactory._optimizer_mon_map.get(optimizer_class, OptimizerMon)
+        return optimizer_mon_class(optimizer)
```
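`OptimizerMonFactory` now dispatches purely on the optimizer's class name, unwraps Megatron's `ChainedOptimizer` by looking at its first inner optimizer, and falls back to the base `OptimizerMon` (the separate `DummyOptimizerMon` class is removed). A hedged sketch of that dispatch shape with a vanilla PyTorch optimizer; the map and monitor classes here are stand-ins, only the class-name lookup and fallback are the point:

```python
# Illustrative dispatch in the style of OptimizerMonFactory; the map and classes
# below are stand-ins for the real monitor hierarchy.
import torch

class BaseMon:
    def __init__(self, torch_opt):
        self.torch_opt = torch_opt

class AdamMon(BaseMon):
    pass

_mon_map = {"Adam": AdamMon}

def create_mon(optimizer):
    optimizer_class = optimizer.__class__.__name__
    if optimizer_class == "ChainedOptimizer":  # Megatron wrapper: inspect the first inner optimizer
        optimizer_class = "Chained" + optimizer.chained_optimizers[0].__class__.__name__
    mon_class = _mon_map.get(optimizer_class, BaseMon)   # fall back to the base monitor
    return mon_class(optimizer)

opt = torch.optim.Adam(torch.nn.Linear(2, 2).parameters())
print(type(create_mon(opt)).__name__)  # AdamMon
```

Falling back to the base monitor rather than a dummy keeps gradient and moment collection usable for any optimizer that exposes a standard `state` and `param_groups`.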
|