mindstudio-probe 1.3.0__py3-none-any.whl → 8.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/METADATA +4 -2
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/RECORD +204 -152
- msprobe/README.md +32 -1
- msprobe/core/__init__.py +17 -0
- msprobe/core/common/const.py +120 -21
- msprobe/core/common/exceptions.py +2 -2
- msprobe/core/common/file_utils.py +279 -50
- msprobe/core/common/framework_adapter.py +169 -0
- msprobe/core/common/global_lock.py +86 -0
- msprobe/core/common/runtime.py +25 -0
- msprobe/core/common/utils.py +136 -45
- msprobe/core/common_config.py +7 -0
- msprobe/core/compare/acc_compare.py +646 -428
- msprobe/core/compare/check.py +36 -103
- msprobe/core/compare/compare_cli.py +4 -0
- msprobe/core/compare/config.py +72 -0
- msprobe/core/compare/highlight.py +215 -215
- msprobe/core/compare/layer_mapping/layer_mapping.py +2 -0
- msprobe/core/compare/merge_result/merge_result.py +4 -4
- msprobe/core/compare/multiprocessing_compute.py +223 -110
- msprobe/core/compare/npy_compare.py +2 -4
- msprobe/core/compare/utils.py +214 -244
- msprobe/core/config_check/__init__.py +17 -0
- msprobe/{pytorch/dump/kernel_dump/kernel_config.py → core/config_check/checkers/__init__.py} +8 -16
- msprobe/core/config_check/checkers/base_checker.py +60 -0
- msprobe/core/config_check/checkers/dataset_checker.py +138 -0
- msprobe/core/config_check/checkers/env_args_checker.py +96 -0
- msprobe/core/config_check/checkers/hyperparameter_checker.py +170 -0
- msprobe/core/config_check/checkers/pip_checker.py +90 -0
- msprobe/core/config_check/checkers/random_checker.py +367 -0
- msprobe/core/config_check/checkers/weights_checker.py +147 -0
- msprobe/core/config_check/ckpt_compare/ckpt_comparator.py +74 -0
- msprobe/core/config_check/ckpt_compare/megatron_loader.py +302 -0
- msprobe/core/config_check/ckpt_compare/metrics.py +83 -0
- msprobe/core/config_check/ckpt_compare/name_mapping.yaml +12 -0
- msprobe/core/config_check/config_check_cli.py +51 -0
- msprobe/core/config_check/config_checker.py +100 -0
- msprobe/{mindspore/runtime.py → core/config_check/resource/dependency.yaml} +7 -4
- msprobe/core/config_check/resource/env.yaml +57 -0
- msprobe/core/config_check/resource/hyperparameter.yaml +21 -0
- msprobe/core/config_check/utils/hyperparameter_parser.py +115 -0
- msprobe/core/config_check/utils/utils.py +107 -0
- msprobe/core/data_dump/api_registry.py +67 -4
- msprobe/core/data_dump/data_collector.py +170 -89
- msprobe/core/data_dump/data_processor/base.py +72 -51
- msprobe/core/data_dump/data_processor/mindspore_processor.py +109 -55
- msprobe/core/data_dump/data_processor/pytorch_processor.py +90 -82
- msprobe/core/data_dump/json_writer.py +143 -27
- msprobe/core/debugger/precision_debugger.py +144 -0
- msprobe/core/grad_probe/constant.py +1 -1
- msprobe/core/grad_probe/grad_compare.py +1 -1
- msprobe/core/grad_probe/utils.py +1 -1
- msprobe/core/hook_manager.py +242 -0
- msprobe/core/monitor/anomaly_processor.py +384 -0
- msprobe/core/service.py +357 -0
- msprobe/core/single_save/__init__.py +0 -0
- msprobe/core/single_save/single_comparator.py +243 -0
- msprobe/core/single_save/single_saver.py +146 -0
- msprobe/docs/01.installation.md +6 -5
- msprobe/docs/02.config_introduction.md +79 -22
- msprobe/docs/03.config_examples.md +1 -0
- msprobe/docs/04.kernel_dump_PyTorch.md +1 -1
- msprobe/docs/05.data_dump_PyTorch.md +118 -49
- msprobe/docs/06.data_dump_MindSpore.md +167 -20
- msprobe/docs/07.accuracy_checker_PyTorch.md +2 -2
- msprobe/docs/08.accuracy_checker_online_PyTorch.md +69 -9
- msprobe/docs/09.accuracy_checker_MindSpore.md +18 -6
- msprobe/docs/10.accuracy_compare_PyTorch.md +212 -74
- msprobe/docs/11.accuracy_compare_MindSpore.md +87 -37
- msprobe/docs/12.overflow_check_PyTorch.md +2 -2
- msprobe/docs/13.overflow_check_MindSpore.md +2 -2
- msprobe/docs/14.data_parse_PyTorch.md +3 -3
- msprobe/docs/17.grad_probe.md +2 -1
- msprobe/docs/18.online_dispatch.md +2 -2
- msprobe/docs/19.monitor.md +90 -44
- msprobe/docs/21.visualization_PyTorch.md +68 -15
- msprobe/docs/22.visualization_MindSpore.md +71 -18
- msprobe/docs/25.tool_function_introduction.md +23 -22
- msprobe/docs/26.data_dump_PyTorch_baseline.md +14 -3
- msprobe/docs/27.dump_json_instruction.md +1 -1
- msprobe/docs/28.debugger_save_instruction.md +111 -20
- msprobe/docs/29.data_dump_MSAdapter.md +2 -2
- msprobe/docs/30.overflow_check_MSAdapter.md +2 -2
- msprobe/docs/31.config_check.md +95 -0
- msprobe/docs/32.ckpt_compare.md +69 -0
- msprobe/docs/33.generate_operator_MindSpore.md +181 -0
- msprobe/docs/34.RL_collect.md +92 -0
- msprobe/docs/35.nan_analyze.md +72 -0
- msprobe/docs/data_dump_MindSpore/data_dump_MindSpore_baseline.md +12 -1
- msprobe/docs/data_dump_MindSpore/dynamic_graph_quick_start_example.md +3 -1
- msprobe/docs/img/compare_result.png +0 -0
- msprobe/docs/img/save_compare_result_sample.png +0 -0
- msprobe/docs/img/visualization/proxy.png +0 -0
- msprobe/mindspore/__init__.py +1 -2
- msprobe/mindspore/api_accuracy_checker/api_accuracy_checker.py +150 -58
- msprobe/mindspore/api_accuracy_checker/api_runner.py +7 -3
- msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +47 -69
- msprobe/mindspore/api_accuracy_checker/cmd_parser.py +4 -0
- msprobe/mindspore/api_accuracy_checker/compute_element.py +0 -1
- msprobe/mindspore/api_accuracy_checker/data_manager.py +2 -2
- msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +460 -0
- msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +2081 -0
- msprobe/mindspore/api_accuracy_checker/multi_api_accuracy_checker.py +9 -0
- msprobe/mindspore/api_accuracy_checker/torch_mindtorch_importer.py +2 -1
- msprobe/mindspore/cell_processor.py +204 -33
- msprobe/mindspore/code_mapping/graph_parser.py +4 -21
- msprobe/mindspore/common/const.py +17 -7
- msprobe/mindspore/common/utils.py +128 -11
- msprobe/mindspore/compare/common_dir_compare.py +382 -0
- msprobe/mindspore/compare/distributed_compare.py +2 -26
- msprobe/mindspore/compare/ms_compare.py +17 -405
- msprobe/mindspore/compare/ms_graph_compare.py +14 -5
- msprobe/mindspore/compare/utils.py +37 -0
- msprobe/mindspore/debugger/debugger_config.py +53 -3
- msprobe/mindspore/debugger/precision_debugger.py +72 -91
- msprobe/mindspore/dump/cell_dump_process.py +877 -0
- msprobe/mindspore/dump/cell_dump_with_insert_gradient.py +864 -0
- msprobe/mindspore/dump/dump_tool_factory.py +13 -5
- msprobe/mindspore/dump/graph_mode_cell_dump.py +139 -0
- msprobe/mindspore/dump/graph_tensor_dump.py +123 -0
- msprobe/mindspore/dump/hook_cell/api_register.py +40 -6
- msprobe/mindspore/dump/hook_cell/hook_cell.py +18 -7
- msprobe/mindspore/dump/hook_cell/ms_hook_manager.py +88 -0
- msprobe/mindspore/dump/hook_cell/primitive_hooks.py +8 -2
- msprobe/mindspore/dump/hook_cell/support_wrap_ops.yaml +18 -0
- msprobe/mindspore/dump/jit_dump.py +21 -18
- msprobe/mindspore/dump/kernel_kbyk_dump.py +6 -3
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cpp +110 -0
- msprobe/mindspore/dym_loader/hook_dynamic_loader.h +15 -15
- msprobe/mindspore/free_benchmark/api_pynative_self_check.py +12 -6
- msprobe/mindspore/free_benchmark/common/utils.py +1 -1
- msprobe/mindspore/grad_probe/global_context.py +7 -2
- msprobe/mindspore/grad_probe/grad_stat_csv.py +3 -2
- msprobe/mindspore/mindspore_service.py +114 -0
- msprobe/mindspore/monitor/common_func.py +52 -0
- msprobe/mindspore/monitor/data_writers.py +237 -0
- msprobe/mindspore/monitor/features.py +20 -7
- msprobe/mindspore/monitor/module_hook.py +281 -209
- msprobe/mindspore/monitor/optimizer_collect.py +334 -0
- msprobe/mindspore/monitor/utils.py +25 -5
- msprobe/mindspore/ms_config.py +16 -15
- msprobe/mindspore/task_handler_factory.py +5 -2
- msprobe/msprobe.py +19 -0
- msprobe/nan_analyze/__init__.py +14 -0
- msprobe/nan_analyze/analyzer.py +255 -0
- msprobe/nan_analyze/graph.py +189 -0
- msprobe/nan_analyze/utils.py +211 -0
- msprobe/pytorch/api_accuracy_checker/common/config.py +2 -2
- msprobe/pytorch/api_accuracy_checker/compare/compare.py +36 -34
- msprobe/pytorch/api_accuracy_checker/compare/compare_utils.py +20 -20
- msprobe/pytorch/api_accuracy_checker/generate_op_script/op_generator.py +4 -7
- msprobe/pytorch/api_accuracy_checker/generate_op_script/operator_replication.template +204 -2
- msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +12 -11
- msprobe/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py +1 -0
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut.py +8 -5
- msprobe/pytorch/api_accuracy_checker/run_ut/run_ut_utils.py +2 -3
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/client.py +29 -13
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/device_dispatch.py +12 -2
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/server.py +45 -31
- msprobe/pytorch/api_accuracy_checker/tensor_transport_layer/utils.py +156 -0
- msprobe/pytorch/attl_manager.py +65 -0
- msprobe/pytorch/bench_functions/npu_fusion_attention.py +27 -0
- msprobe/pytorch/common/utils.py +26 -14
- msprobe/pytorch/compare/distributed_compare.py +4 -36
- msprobe/pytorch/compare/pt_compare.py +13 -84
- msprobe/pytorch/compare/utils.py +47 -0
- msprobe/pytorch/debugger/debugger_config.py +34 -17
- msprobe/pytorch/debugger/precision_debugger.py +66 -118
- msprobe/pytorch/dump/module_dump/hook_wrapper.py +93 -0
- msprobe/pytorch/dump/module_dump/module_dump.py +11 -58
- msprobe/pytorch/dump/module_dump/module_processer.py +143 -113
- msprobe/pytorch/grad_probe/grad_stat_csv.py +3 -2
- msprobe/pytorch/hook_module/api_register.py +29 -5
- msprobe/pytorch/hook_module/hook_module.py +9 -18
- msprobe/pytorch/hook_module/jit_script_wrapper.py +33 -0
- msprobe/pytorch/hook_module/pt_hook_manager.py +68 -0
- msprobe/pytorch/hook_module/support_wrap_ops.yaml +22 -1
- msprobe/pytorch/hook_module/utils.py +28 -2
- msprobe/pytorch/monitor/csv2tb.py +6 -2
- msprobe/pytorch/monitor/data_writers.py +259 -0
- msprobe/pytorch/monitor/module_hook.py +227 -158
- msprobe/pytorch/monitor/module_metric.py +14 -0
- msprobe/pytorch/monitor/optimizer_collect.py +242 -270
- msprobe/pytorch/monitor/utils.py +16 -3
- msprobe/pytorch/online_dispatch/dispatch.py +4 -2
- msprobe/pytorch/online_dispatch/dump_compare.py +5 -2
- msprobe/pytorch/parse_tool/lib/utils.py +3 -3
- msprobe/pytorch/pt_config.py +8 -7
- msprobe/pytorch/pytorch_service.py +73 -0
- msprobe/visualization/builder/graph_builder.py +33 -13
- msprobe/visualization/builder/msprobe_adapter.py +24 -11
- msprobe/visualization/compare/graph_comparator.py +53 -45
- msprobe/visualization/compare/mode_adapter.py +31 -1
- msprobe/visualization/graph/base_node.py +3 -3
- msprobe/visualization/graph/graph.py +2 -2
- msprobe/visualization/graph_service.py +250 -103
- msprobe/visualization/utils.py +27 -11
- msprobe/mindspore/dym_loader/hook_dynamic_loader.cc +0 -106
- msprobe/mindspore/monitor/anomaly_detect.py +0 -404
- msprobe/mindspore/monitor/module_spec_verifier.py +0 -94
- msprobe/mindspore/service.py +0 -549
- msprobe/pytorch/monitor/anomaly_analyse.py +0 -201
- msprobe/pytorch/monitor/anomaly_detect.py +0 -410
- msprobe/pytorch/monitor/module_spec_verifier.py +0 -95
- msprobe/pytorch/monitor/unittest/test_monitor.py +0 -160
- msprobe/pytorch/service.py +0 -473
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/LICENSE +0 -0
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/WHEEL +0 -0
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/entry_points.txt +0 -0
- {mindstudio_probe-1.3.0.dist-info → mindstudio_probe-8.1.1.dist-info}/top_level.txt +0 -0
- /msprobe/{mindspore → core}/compare/ms_to_pt_api.yaml +0 -0
- /msprobe/{mindspore/dump → core}/kernel_dump/kernel_config.py +0 -0
- /msprobe/{pytorch/monitor/unittest → core/monitor}/__init__.py +0 -0
@@ -144,6 +144,20 @@ class IdentMetric(Metric):
         return tensor
 
 
+@register_config_metric("shape")
+class ShapeMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        return tensor.shape
+
+
+@register_config_metric("dtype")
+class DtypeMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        return tensor.dtype
+
+
 def get_metrics(ops, tag2tensor, eps, out_dict=None):
     """
     :param ops: ["op1", "op2"]
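The hunk above (evidently the +14/-0 change to msprobe/pytorch/monitor/module_metric.py) registers two new metrics, `shape` and `dtype`, through the `@register_config_metric` decorator. The sketch below illustrates the decorator-registry pattern this relies on; the `config_metric_registry` dict and the decorator body are illustrative assumptions, not code taken from the package.

```python
# Sketch of a decorator-based metric registry (assumed names, for illustration only).
config_metric_registry = {}  # assumed: maps a config key such as "shape" to a Metric subclass


def register_config_metric(name):
    """Register a Metric subclass under the given config key."""
    def decorator(cls):
        config_metric_registry[name] = cls
        return cls
    return decorator


class Metric:
    @staticmethod
    def get_metric_value(tensor, eps):
        raise NotImplementedError


@register_config_metric("shape")
class ShapeMetric(Metric):
    @staticmethod
    def get_metric_value(tensor, eps):
        return tensor.shape


# A caller can then look metrics up by name, e.g.:
#     config_metric_registry["shape"].get_metric_value(tensor, eps=1e-8)
```

Registering by name is what lets `get_metrics(ops, tag2tensor, eps, ...)` iterate over whichever ops the monitor config lists without hard-coding the new metric classes.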
@@ -12,129 +12,123 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from collections import defaultdict
+from abc import abstractmethod
 
 import torch
-import torch.distributed as dist
 
 from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.monitor.utils import MVResult
+from msprobe.pytorch.monitor.utils import MVResult
+from msprobe.core.common.const import MonitorConst
 
 
 class OptimizerMon(object):
-    def __init__(self) -> None:
+    def __init__(self, torch_opt) -> None:
         self.fp16_to_fp32_param = {}
-        self.…
+        self.torch_opt = torch_opt
+        self.state = {}
 
-    def …
-
+    def narrow_from_flatten(self, param, flatten_state):
+        return flatten_state
+
+    def get_state(self, torch_opt):
+        if hasattr(torch_opt, 'chained_optimizers'):
+            for opt in torch_opt.chained_optimizers:
+                self._get_single_state(opt)
+        else:
+            self._get_single_state(torch_opt)
 
-    def …
-… (4 removed lines; content not captured)
+    def fetch_grad(self, monitor, params2name):
+        if not self.fp16_to_fp32_param:
+            self.map_fp16_to_fp32_param(self.torch_opt)
+
+        grad_dict = {}
+        first_param = True
         for param, name in params2name.items():
-            if …
-… (6 removed lines; content not captured)
-            if …
-
-
+            if monitor.duplicate_param.get(name, False):
+                continue
+            if self.fp16_to_fp32_param and param not in self.fp16_to_fp32_param:
+                continue
+            grad = param.main_grad if monitor.params_have_main_grad else param.grad
+            element_in_cur_partition = self.fp16_to_fp32_param.get(param, param).numel()
+            if param.numel() != element_in_cur_partition:
+                if first_param:
+                    grad = grad.flatten()[-element_in_cur_partition:]
+                else:  # supposed to be the last one
+                    grad = grad.flatten()[:element_in_cur_partition]
+                first_param = False
+
+            if grad is None:
+                if not monitor.fsdp_wrapped_module:
+                    logger.warning(f"grad is None: {name}, maybe something wrong happened.")
+                continue
+            tag = monitor.name2tag.get(name, {}).get(MonitorConst.POST_GRAD)
+            monitor.register_param_call_id("hook_optimizer", tag)
+            grad_dict[tag] = grad
+        return grad_dict
+
+    def map_fp16_to_fp32_param(self, torch_opt):
+        pass
+
+    def fetch_mv(self, monitor, params2name):
+        if not self.fp16_to_fp32_param:
+            self.map_fp16_to_fp32_param(self.torch_opt)
+        if not self.state:
+            self.get_state(self.torch_opt)
+
+        exp_avg_dict = {}
+        exp_avg_sq_dict = {}
+        update_dict = {}
+        ratio_dict = {}
+
+        if not self.state:
+            logger.warning('optimizer state can not accessed')
+            return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict)
+
+        for lp_param, name in params2name.items():
+            if lp_param in self.fp16_to_fp32_param:
+                hp_param = self.fp16_to_fp32_param[lp_param]
+            else:
+                hp_param = lp_param
+
+            if hp_param in self.state:
+                state_param = self.state.get(hp_param, {})
+                exp_avg = self.narrow_from_flatten(lp_param, state_param.get("exp_avg", None))
+                exp_avg_sq = self.narrow_from_flatten(lp_param, state_param.get("exp_avg_sq", None))
                 if monitor.mv_distribution:
                     exp_avg_dict[name] = exp_avg
                     exp_avg_sq_dict[name] = exp_avg_sq
                 if monitor.mg_direction:
                     exp_avg_dict[name] = exp_avg
                 if monitor.ur_distribution:
-                    if len(torch_opt.param_groups) > 1:
-                        logger.info(f"the length of torch_opt.param_groups is {len(torch_opt.param_groups)}.")
+                    if len(self.torch_opt.param_groups) > 1:
+                        logger.info(f"the length of torch_opt.param_groups is {len(self.torch_opt.param_groups)}.")
                     if 'step' in state_param:
                         step = state_param['step']  # Optimizer from pytorch or FusedAdam from apex(used by megatron)
-                    elif 'step' in torch_opt.param_groups[0]:
-                        step = torch_opt.param_groups[0]['step']  # AdamW from mindspeed
+                    elif 'step' in self.torch_opt.param_groups[0]:
+                        step = self.torch_opt.param_groups[0]['step']  # AdamW from mindspeed
                     else:
                         logger.warning(f"step of {name} is None, maybe something wrong happened.")
                         continue
-… (3 removed lines; content not captured)
+                    if exp_avg is None or exp_avg_sq is None:
+                        logger.warning(f"exp_avg or exp_avg_sq of {name} is None, skip calculation.")
+                        continue
+                    exp_avg_hat = exp_avg / (1 - self.torch_opt.defaults['betas'][0] ** step)
+                    exp_avg_sq_hat = exp_avg_sq / (1 - self.torch_opt.defaults['betas'][1] ** step)
+                    update_dict[name] = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + self.torch_opt.defaults['eps'])
                     ratio_dict[name] = exp_avg_hat / torch.sqrt(exp_avg_sq_hat)
                     monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name])
                     monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name])
         return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict)
-
-    def …
-… (8 removed lines; content not captured)
-        def get_flatten_grad(self, optimizer, group_idx):
-            if fp32_partitioned_groups_flat[group_idx].grad is None:
-                if partition_id == dist.get_world_size() - 1 and not self.is_stage3:
-                    fp32_partitioned_groups_flat_grad = optimizer.flatten_dense_tensors_aligned(
-                        optimizer.averaged_gradients[group_idx],
-                        int(optimizer.partition_size[group_idx])
-                    ).to(fp32_partitioned_groups_flat[group_idx].dtype)
-                else:
-                    fp32_partitioned_groups_flat_grad = optimizer.flatten(
-                        optimizer.averaged_gradients[group_idx]
-                    ).to(fp32_partitioned_groups_flat[group_idx].dtype)
-                return fp32_partitioned_groups_flat_grad
-            else:
-                return fp32_partitioned_groups_flat[group_idx].grad
-
-        for group_idx in range(len(fp32_partitioned_groups_flat)):
-            fp32_partitioned_groups_flat_grad[group_idx] = get_flatten_grad(self, torch_opt, group_idx)
-
-        for name in params2name.values():
-            start_idx, end_idx, group_idx, group_with_rank = name2indices[name]
-            if group_with_rank != partition_id and isinstance(group_with_rank, int):
-                continue
-            fp32_param = fp32_partitioned_groups_flat[group_idx][start_idx: end_idx]
-            fp32_param.grad = fp32_partitioned_groups_flat_grad[group_idx][start_idx: end_idx]
-            param2name[fp32_param] = name
-            if not torch_opt.state:
-                continue
-            state_param = list(torch_opt.state.values())[group_idx]
-            exp_avg = state_param.get("exp_avg", None)
-            exp_avg_sq = state_param.get("exp_avg_sq", None)
-            if exp_avg is None or exp_avg_sq is None:
-                logger.warning(f"exp_avg or exp_avg_sq of {name} is None, maybe something wrong happened.")
-                continue
-            exp_avg = exp_avg[start_idx: end_idx]
-            exp_avg_sq = exp_avg_sq[start_idx: end_idx]
-            if monitor.mv_distribution:
-                exp_avg_dict[name] = exp_avg
-                exp_avg_sq_dict[name] = exp_avg_sq
-            if monitor.mg_direction:
-                exp_avg_dict[name] = exp_avg
-            if monitor.ur_distribution:
-                if 'step' in state_param:
-                    step = state_param['step']  # Optimizer from pytorch or FusedAdam from apex(used by megatron)
-                elif 'step' in torch_opt.param_groups[group_idx]:
-                    step = torch_opt.param_groups[group_idx]['step']  # AdamW from mindspeed
-                else:
-                    logger.warning(f"step of {name} is None, maybe something wrong happened.")
-                    continue
-                exp_avg_hat = exp_avg / (1 - torch_opt.defaults['betas'][0] ** step)
-                exp_avg_sq_hat = exp_avg_sq / (1 - torch_opt.defaults['betas'][1] ** step)
-                update_dict[name] = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + torch_opt.defaults['eps'])
-                ratio_dict[name] = exp_avg_hat / torch.sqrt(exp_avg_sq_hat)
-                monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name])
-                monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name])
-        del fp32_partitioned_groups_flat_grad
-        return MVGradResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict,
-                            grad=param2name)
+
+    def _get_single_state(self, torch_opt):
+        state = {}
+        if hasattr(torch_opt, 'param_to_cpu_states_map'):
+            state = torch_opt.param_to_cpu_states_map
+        elif hasattr(torch_opt, 'state'):
+            state = torch_opt.state
+        elif hasattr(torch_opt, 'optimizer') and hasattr(torch_opt.optimizer, 'state'):
+            state = torch_opt.optimizer.state
+        self.state.update(state)
 
 
 class MixPrecisionOptimizerMon(OptimizerMon):
@@ -142,21 +136,14 @@ class MixPrecisionOptimizerMon(OptimizerMon):
     混合精度优化器监控类。在混合精度训练中监控和管理优化器。
     混合精度训练通过适当降低某些计算的精度来加速训练过程并减少内存消耗。
     """
-
-    def map_fp16_tp_fp32_param(self, torch_opt):
+    def map_fp16_to_fp32_param(self, torch_opt):
         for fp16_group, fp32_group in zip(torch_opt.float16_groups, torch_opt.fp32_from_float16_groups):
             for fp16_param, fp32_param in zip(fp16_group, fp32_group):
                 self.fp16_to_fp32_param[fp16_param] = fp32_param
 
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        if not self.fp16_to_fp32_param and torch_opt is not None:
-            self.map_fp16_tp_fp32_param(torch_opt)
-
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
-
 
 class MegatronDistributedOptimizerMon(OptimizerMon):
-    def …
+    def map_fp16_to_fp32_param(self, torch_opt):
         if not (hasattr(torch_opt, "model_float16_groups") and
                 hasattr(torch_opt, "shard_fp32_from_float16_groups")):
             raise Exception(
@@ -167,192 +154,176 @@ class MegatronDistributedOptimizerMon(OptimizerMon):
             for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group):
                 self.fp16_to_fp32_param[fp16_param] = shard_fp32_param
 
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        if not self.fp16_to_fp32_param and torch_opt is not None:
-            self.map_fp16_tp_fp32_param(torch_opt)
-
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
-
-
-class MegatronFP32OptimizerMon(OptimizerMon):
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
-
 
 class MegatronChainedDistributedOptimizerMon(MegatronDistributedOptimizerMon):
-    def …
-
-
-            self.map_fp16_tp_fp32_param(opt)
-
-        if not isinstance(torch_opt, torch.optim.Optimizer) and not hasattr(torch_opt, 'state'):
-            torch_opt.state = {}
-            for opt in torch_opt.chained_optimizers:
-                torch_opt.state.update(opt.optimizer.state)
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
+    def map_fp16_to_fp32_param(self, torch_opt):
+        for opt in torch_opt.chained_optimizers:
+            super().map_fp16_to_fp32_param(opt)
 
 
 class MegatronChainedMixPrecisionOptimizerMon(MixPrecisionOptimizerMon):
-    def …
-
-
-            self.map_fp16_tp_fp32_param(opt)
+    def map_fp16_to_fp32_param(self, torch_opt):
+        for opt in torch_opt.chained_optimizers:
+            super().map_fp16_to_fp32_param(opt)
 
-        if not isinstance(torch_opt, torch.optim.Optimizer) and not hasattr(torch_opt, 'state'):
-            torch_opt.state = {}
-            for opt in torch_opt.chained_optimizers:
-                torch_opt.state.update(opt.optimizer.state)
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
 
-… (6 removed lines; content not captured)
+class DeepSpeedZeroOptimizerMon(OptimizerMon):
+    """
+    Base monitor class for DeepSpeed ZeRO optimizer.
+    ZeRO stage 0 no partition
+    ZeRO stage 1 partitions optimizer states across data parallel processes.
+    ZeRO stage 2 additionally partitions gradients.
+    ZeRO stage 3 additionally partitions parameters.
+
+    This class provides monitoring capabilities for ZeRO optimizers by:
+    - Handling gradient collection for different ZeRO stages
+    - Managing optimizer state access for monitoring
+    """
+    def __init__(self, torch_opt):
+        super().__init__(torch_opt)
+        self.stage = ''
+        self.bit16_groups = []
+        self.fp32_flat_groups = []
+        self.param2group = ()
+        self.param2index = []
+        self.group_offset = {}
+
+    @abstractmethod
+    def get_grad_for_param(self, lp_param, group_idx, param_id):
+        raise NotImplementedError
+
+    def param_not_in_partition(self, lp_param, group_idx):
+        param_slice_mapping = self.torch_opt.state_dict()['param_slice_mappings'][group_idx]
+        hp_address = param_slice_mapping.get(self.torch_opt.param_names.get(lp_param))
+        return hp_address is None
+
+    def get_position(self, lp_param, group_idx):
+        param_slice_mapping = self.torch_opt.state_dict()['param_slice_mappings'][group_idx]
+        hp_address = param_slice_mapping.get(self.torch_opt.param_names.get(lp_param))
+        return hp_address.start, hp_address.numel
+
+    def get_group_index(self):
+        param2group = {}
+        for group_idx, bit16_group in enumerate(self.bit16_groups):
             for param in bit16_group:
                 param2group[param] = group_idx
         return param2group
-
-    def …
-
-… (15 removed lines; content not captured)
+
+    def get_param_index(self, lp_param, group_idx):
+        if not self.param2index:
+            for group in self.bit16_groups:
+                param2index = {}
+                for index, param in enumerate(group):
+                    param2index[param] = index
+                self.param2index.append(param2index)
+
+        return self.param2index[group_idx][lp_param]
+
+    def narrow_from_flatten(self, param, flatten_state):
+        if flatten_state is None:
+            return flatten_state
+        group_idx = self.param2group[param]
+        if self.param_not_in_partition(param, group_idx):
+            return None
+        start, numel = self.get_position(param, group_idx)
+        return flatten_state.narrow(0, start, numel)
+
+    def map_fp16_to_fp32_param(self, torch_opt):
+        for group_idx, group in enumerate(self.bit16_groups):
+            for param in group:
+                self.fp16_to_fp32_param[param] = self.fp32_flat_groups[group_idx]
+
+    def fetch_grad(self, monitor, params2name):
+        grad_dict = {}
+        for lp_param, name in params2name.items():
+            group_idx = self.param2group[lp_param]
+            param_id = self.get_param_index(lp_param, group_idx)
+            if self.param_not_in_partition(lp_param, group_idx):
                 continue
-… (15 removed lines; content not captured)
+            if self.stage == '1or2':
+                param_id = param_id - self.group_offset[group_idx] - 1
+            grad = self.get_grad_for_param(lp_param, group_idx, param_id)
+            tag = monitor.name2tag.get(name, {}).get(MonitorConst.POST_GRAD)
+            monitor.register_param_call_id("hook_optimizer", tag)
+            grad_dict[tag] = grad
+
+        return grad_dict
+
+
+class DeepSpeedZeroOptimizerStage0Mon(DeepSpeedZeroOptimizerMon):
+    def __init__(self, torch_opt):
+        super().__init__(torch_opt)
+        self.stage = '0'
+        self.bit16_groups = torch_opt.bf16_groups
+        self.fp32_flat_groups = torch_opt.fp32_groups_flat_partition
+        self.param2group = self.get_group_index()
+
+    def get_grad_for_param(self, lp_param, group_idx, param_id):
+        return self.torch_opt.fp32_groups_gradient_dict[group_idx][param_id]
+
+
+class DeepSpeedZeroOptimizerStage1or2Mon(DeepSpeedZeroOptimizerMon):
+    def __init__(self, torch_opt):
+        super().__init__(torch_opt)
+        self.stage = '1or2'
+        self.bit16_groups = torch_opt.bit16_groups
+        self.fp32_flat_groups = torch_opt.single_partition_of_fp32_groups
+        self.param2group = self.get_group_index()
+        self.group_offset = {}
+        self.get_group_offset()
+
+    def get_grad_for_param(self, lp_param, group_idx, param_id):
+        if getattr(self.torch_opt, "cpu_offload", False):
+            grads = self.torch_opt.single_partition_of_fp32_groups[group_idx].grad
+            start, numel = self.get_position(lp_param, group_idx)
+            grad = grads.narrow(0, start, numel)
+        else:
+            grad = self.torch_opt.averaged_gradients[group_idx][param_id]
+        return grad
+
+    def get_group_offset(self):
+        for group_idx, group in enumerate(self.bit16_groups):
+            self.group_offset[group_idx] = -1
+            for lp_param in group:
+                if self.param_not_in_partition(lp_param, group_idx):
+                    self.group_offset[group_idx] = self.get_param_index(lp_param, group_idx)
                 else:
-
-                continue
-            exp_avg = state['exp_avg'].narrow(0, start, numel)
-            exp_avg_sq = state['exp_avg_sq'].narrow(0, start, numel)
-            exp_avg_hat = exp_avg / (1 - torch_opt.defaults['betas'][0] ** step)
-            exp_avg_sq_hat = exp_avg_sq / (1 - torch_opt.defaults['betas'][1] ** step)
-            update_dict[name] = exp_avg_hat / (torch.sqrt(exp_avg_sq_hat) + torch_opt.defaults['eps'])
-            ratio_dict[name] = exp_avg_hat / torch.sqrt(exp_avg_sq_hat)
-            monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name])
-            monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name])
-        return MVResult(exp_avg=exp_avg_dict, exp_avg_sq=exp_avg_sq_dict, update=update_dict, ratio=ratio_dict)
-
+                    break
 
-… (19 removed lines; content not captured)
-    def fetch_mv(self, monitor, torch_opt, params2name, name2indices=None):
-        self.is_stage3 = True
-        fp32_partitioned_groups_flat = torch_opt.fp32_partitioned_groups_flat
-        return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat)
-
-
-class DeepSpeedZeroOptimizerStage1or2Mon(OptimizerMon):
-    @staticmethod
-    def get_group_index(fp32_length, world_size, index):
-        for i in range(len(fp32_length) - 1):
-            if fp32_length[i] <= index < fp32_length[i + 1]:
-                interval_start = fp32_length[i]
-                interval_length = fp32_length[i + 1] - fp32_length[i]
-                sub_interval_length = interval_length // world_size
-                sub_index = (index - interval_start) // sub_interval_length
-                sub_interval_start = interval_start + sub_index * sub_interval_length
-                return sub_interval_start, min(sub_index, world_size - 1)
-        return fp32_length[-1], 0
-
-    def get_param_index(self, params2name, name2index, torch_opt):
-        padding = torch_opt.groups_padding
-        world_size = dist.get_world_size()
-        fp32_length = [0]
-        for fp32_group_index, single_partition_of_fp32_group in enumerate(torch_opt.single_partition_of_fp32_groups):
-            fp32_length.append(len(single_partition_of_fp32_group) * world_size + fp32_length[fp32_group_index])
-
-        bf16_groups = []
-        name2indices = defaultdict()
-        index_length = defaultdict()
-        index = 0
-        idx = 0
-        for group_idx, bf16_group in enumerate(torch_opt.bit16_groups):
-            bf16_groups.extend(bf16_group)
-            for param in bf16_group:
-                param_length = len(param.flatten())
-                group_index, group_with_rank = self.get_group_index(fp32_length, world_size, index)
-                index_length[idx] = (index, index + param_length, group_idx, group_index, group_with_rank)
-                index += param_length
-                idx += 1
-        group_length = len(bf16_groups) / len(torch_opt.bit16_groups)
-        for _, name in params2name.items():
-            name_index = name2index[name]
-            start_idx, end_idx, group_idx, group_index, group_with_rank = index_length[name_index]
-            need_padding = True if group_with_rank == world_size - 1 else False
-            new_start_idx = start_idx - group_index
-            new_end_idx = end_idx - group_index
-            if need_padding and group_length - 1 <= name_index <= len(bf16_groups) - 1 and name_index % (
-                    group_length - 1) == 0:
-                new_end_idx -= padding[int(name_index // (group_length - 1) - 1)]
-            name2indices[name] = (new_start_idx, new_end_idx, group_idx, group_with_rank)
-        return name2indices
-
-    def fetch_mv(self, monitor, torch_opt, params2name, name2indices=None):
-        fp32_partitioned_groups_flat = torch_opt.single_partition_of_fp32_groups
-        return self._fetch_mv_grad_in_adam(monitor, torch_opt, params2name, name2indices, fp32_partitioned_groups_flat)
-
-
-class DummyOptimizerMon(OptimizerMon):
-    def fetch_mv(self, monitor, torch_opt, params2name):
-        return self._fetch_mv_in_adam(monitor, torch_opt, params2name)
+
+class DeepSpeedZeroOptimizerStage3Mon(DeepSpeedZeroOptimizerMon):
+    def __init__(self, torch_opt):
+        super().__init__(torch_opt)
+        self.stage = '3'
+        self.bit16_groups = torch_opt.fp16_groups
+        self.fp32_flat_groups = torch_opt.fp32_partitioned_groups_flat
+        self.param2group = self.get_group_index()
+
+    def param_not_in_partition(self, lp_param, group_idx):
+        """Each param partioned across all zero ranks"""
+        return False
+
+    def get_position(self, lp_param, group_idx):
+        param_id = self.torch_opt.get_param_id(lp_param)
+        return self.torch_opt.grad_position[param_id][1:]
+
+    def get_grad_for_param(self, lp_param, group_idx, param_id):
+        return self.torch_opt.averaged_gradients[group_idx][param_id]
 
 
 class OptimizerMonFactory:
     _optimizer_mon_map = {
-        "FP32Optimizer": …
+        "FP32Optimizer": OptimizerMon,
         "Float16OptimizerWithFloat16Params": MixPrecisionOptimizerMon,
         "DistributedOptimizer": MegatronDistributedOptimizerMon,
+        "SwapDistributedOptimizer": MegatronDistributedOptimizerMon,
         "ChainedDistributedOptimizer": MegatronChainedDistributedOptimizerMon,
+        "ChainedSwapDistributedOptimizer": MegatronChainedDistributedOptimizerMon,
         "ChainedFloat16OptimizerWithFloat16Params": MegatronChainedMixPrecisionOptimizerMon,
         "BF16_Optimizer": DeepSpeedZeroOptimizerStage0Mon,
         "DeepSpeedZeroOptimizer": DeepSpeedZeroOptimizerStage1or2Mon,
         "DeepSpeedZeroOptimizer_Stage3": DeepSpeedZeroOptimizerStage3Mon,
-        "Adam": …
+        "Adam": OptimizerMon
     }
 
     @staticmethod
@@ -361,6 +332,7 @@ class OptimizerMonFactory:
         optimizer_class = optimizer.__class__.__name__
         if optimizer_class == "ChainedOptimizer":
             optimizer_class = "Chained" + optimizer.chained_optimizers[0].__class__.__name__
+        logger.info(f'The optimizer type is {optimizer_class}')
 
-        optimizer_mon_class = OptimizerMonFactory._optimizer_mon_map.get(optimizer_class, …
-        return optimizer_mon_class()
+        optimizer_mon_class = OptimizerMonFactory._optimizer_mon_map.get(optimizer_class, OptimizerMon)
+        return optimizer_mon_class(optimizer)