mlperf-logging 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlperf_logging/__init__.py +0 -0
- mlperf_logging/benchmark_meta.py +184 -0
- mlperf_logging/compliance_checker/__init__.py +0 -0
- mlperf_logging/compliance_checker/__main__.py +32 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/closed_common.yaml +24 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/closed_cosmoflow.yaml +47 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam.yaml +80 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_cosine_annealing.yaml +10 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_lamb.yaml +16 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_multistep.yaml +10 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/closed_oc20.yaml +39 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/common.yaml +156 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/open_common.yaml +5 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/open_cosmoflow.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/open_deepcam.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_1.0.0/open_oc20.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/closed_common.yaml +24 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/closed_cosmoflow.yaml +47 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/closed_deepcam.yaml +80 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/closed_deepcam_cosine_annealing.yaml +10 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/closed_deepcam_lamb.yaml +16 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/closed_deepcam_multistep.yaml +10 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/closed_oc20.yaml +39 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/common.yaml +156 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/open_common.yaml +5 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/open_cosmoflow.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/open_deepcam.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_2.0.0/open_oc20.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/closed_common.yaml +24 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/closed_cosmoflow.yaml +47 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/closed_deepcam.yaml +80 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/closed_deepcam_cosine_annealing.yaml +10 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/closed_deepcam_lamb.yaml +16 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/closed_deepcam_multistep.yaml +10 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/closed_oc20.yaml +39 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/closed_openfold.yaml +196 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/common.yaml +154 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/open_common.yaml +5 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/open_cosmoflow.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/open_deepcam.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/open_oc20.yaml +6 -0
- mlperf_logging/compliance_checker/hpc_3.0.0/open_openfold.yaml +6 -0
- mlperf_logging/compliance_checker/mlp_compliance.py +345 -0
- mlperf_logging/compliance_checker/mlp_parser/__init__.py +35 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_060.py +111 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_070.py +104 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_100.py +104 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_110.py +104 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_200.py +104 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_210.py +104 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_300.py +104 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_310.py +105 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_400.py +105 -0
- mlperf_logging/compliance_checker/mlp_parser/ruleset_410.py +105 -0
- mlperf_logging/compliance_checker/training_0.6.0/common.yaml +135 -0
- mlperf_logging/compliance_checker/training_0.6.0/gnmt.yaml +51 -0
- mlperf_logging/compliance_checker/training_0.6.0/maskrcnn.yaml +27 -0
- mlperf_logging/compliance_checker/training_0.6.0/minigo.yaml +147 -0
- mlperf_logging/compliance_checker/training_0.6.0/resnet.yaml +59 -0
- mlperf_logging/compliance_checker/training_0.6.0/score.yaml +20 -0
- mlperf_logging/compliance_checker/training_0.6.0/ssd.yaml +36 -0
- mlperf_logging/compliance_checker/training_0.6.0/transformer.yaml +41 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_bert.yaml +49 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_common.yaml +6 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_dlrm.yaml +36 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_gnmt.yaml +47 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_maskrcnn.yaml +57 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_minigo.yaml +43 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_resnet.yaml +18 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_resnet_lars.yaml +37 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_resnet_sgd.yaml +36 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_ssd.yaml +45 -0
- mlperf_logging/compliance_checker/training_0.7.0/closed_transformer.yaml +50 -0
- mlperf_logging/compliance_checker/training_0.7.0/common.yaml +154 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_common.yaml +6 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_dlrm.yaml +7 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_gnmt.yaml +8 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_maskrcnn.yaml +12 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_minigo.yaml +9 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_resnet.yaml +7 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_0.7.0/open_transformer.yaml +7 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/bert.yaml +50 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/common.yaml +210 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/dlrm.yaml +17 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/gnmt.yaml +52 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/maskrcnn.yaml +56 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/minigo.yaml +85 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/resnet.yaml +26 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/resnet_lars.yaml +27 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/resnet_sgd.yaml +6 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/ssd.yaml +41 -0
- mlperf_logging/compliance_checker/training_0.7.0_warn/transformer.yaml +44 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_bert.yaml +49 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_common.yaml +11 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_dlrm.yaml +36 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_maskrcnn.yaml +57 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_minigo.yaml +43 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_resnet.yaml +18 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_resnet_lars.yaml +37 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_resnet_sgd.yaml +36 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_rnnt.yaml +159 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_ssd.yaml +45 -0
- mlperf_logging/compliance_checker/training_1.0.0/closed_unet3d.yaml +73 -0
- mlperf_logging/compliance_checker/training_1.0.0/common.yaml +149 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_common.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_dlrm.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_maskrcnn.yaml +12 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_minigo.yaml +9 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_resnet.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_rnnt.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.0.0/open_unet3d.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_bert.yaml +463 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_common.yaml +11 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_dlrm.yaml +61 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_maskrcnn.yaml +94 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_minigo.yaml +108 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_resnet.yaml +195 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_resnet_lars.yaml +37 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_resnet_sgd.yaml +36 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_rnnt.yaml +159 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_ssd.yaml +181 -0
- mlperf_logging/compliance_checker/training_1.1.0/closed_unet3d.yaml +140 -0
- mlperf_logging/compliance_checker/training_1.1.0/common.yaml +149 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_common.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_dlrm.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_maskrcnn.yaml +12 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_minigo.yaml +9 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_resnet.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_rnnt.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_1.1.0/open_unet3d.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_bert.yaml +463 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_common.yaml +11 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_dlrm.yaml +61 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_maskrcnn.yaml +96 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_minigo.yaml +108 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_resnet.yaml +195 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_resnet_lars.yaml +37 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_resnet_sgd.yaml +36 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_rnnt.yaml +159 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_ssd.yaml +141 -0
- mlperf_logging/compliance_checker/training_2.0.0/closed_unet3d.yaml +140 -0
- mlperf_logging/compliance_checker/training_2.0.0/common.yaml +150 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_common.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_dlrm.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_maskrcnn.yaml +12 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_minigo.yaml +9 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_resnet.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_rnnt.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.0.0/open_unet3d.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_bert.yaml +463 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_common.yaml +11 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_dlrm.yaml +61 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_maskrcnn.yaml +96 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_minigo.yaml +108 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_resnet.yaml +195 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_resnet_lars.yaml +37 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_resnet_sgd.yaml +36 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_rnnt.yaml +159 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_ssd.yaml +141 -0
- mlperf_logging/compliance_checker/training_2.1.0/closed_unet3d.yaml +140 -0
- mlperf_logging/compliance_checker/training_2.1.0/common.yaml +150 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_common.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_dlrm.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_maskrcnn.yaml +12 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_minigo.yaml +9 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_resnet.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_rnnt.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_2.1.0/open_unet3d.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_bert.yaml +48 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_common.yaml +11 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_dlrm.yaml +35 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_dlrm_dcnv2.yaml +59 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_gpt3.yaml +79 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_maskrcnn.yaml +57 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_minigo.yaml +43 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_resnet.yaml +17 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_resnet_lars.yaml +37 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_resnet_sgd.yaml +36 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_rnnt.yaml +138 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_ssd.yaml +35 -0
- mlperf_logging/compliance_checker/training_3.0.0/closed_unet3d.yaml +73 -0
- mlperf_logging/compliance_checker/training_3.0.0/common.yaml +150 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_common.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_dlrm_dcnv2.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_maskrcnn.yaml +12 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_minigo.yaml +9 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_resnet.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_rnnt.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.0.0/open_unet3d.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_bert.yaml +48 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_common.yaml +11 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_dlrm.yaml +35 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_dlrm_dcnv2.yaml +59 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_gpt3.yaml +79 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_maskrcnn.yaml +57 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_minigo.yaml +43 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_resnet.yaml +17 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_resnet_lars.yaml +37 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_resnet_sgd.yaml +36 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_rnnt.yaml +138 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_ssd.yaml +35 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_stable_diffusion.yaml +74 -0
- mlperf_logging/compliance_checker/training_3.1.0/closed_unet3d.yaml +73 -0
- mlperf_logging/compliance_checker/training_3.1.0/common.yaml +151 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_common.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_dlrm_dcnv2.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_maskrcnn.yaml +12 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_minigo.yaml +9 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_resnet.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_rnnt.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_stable_diffusion.yaml +33 -0
- mlperf_logging/compliance_checker/training_3.1.0/open_unet3d.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_bert.yaml +48 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml +11 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_dlrm_dcnv2.yaml +59 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml +21 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_gpt3.yaml +79 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_llama2_70b_lora.yaml +45 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_resnet.yaml +17 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_lars.yaml +37 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_sgd.yaml +36 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_ssd.yaml +35 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_stable_diffusion.yaml +74 -0
- mlperf_logging/compliance_checker/training_4.0.0/closed_unet3d.yaml +73 -0
- mlperf_logging/compliance_checker/training_4.0.0/common.yaml +151 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml +6 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_dlrm_dcnv2.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_gnn.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_gpt3.yaml +79 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_llama2_70b_lora.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_resnet.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_stable_diffusion.yaml +33 -0
- mlperf_logging/compliance_checker/training_4.0.0/open_unet3d.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.1.0/closed_bert.yaml +48 -0
- mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml +11 -0
- mlperf_logging/compliance_checker/training_4.1.0/closed_dlrm_dcnv2.yaml +59 -0
- mlperf_logging/compliance_checker/training_4.1.0/closed_gnn.yaml +21 -0
- mlperf_logging/compliance_checker/training_4.1.0/closed_gpt3.yaml +79 -0
- mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml +42 -0
- mlperf_logging/compliance_checker/training_4.1.0/closed_ssd.yaml +35 -0
- mlperf_logging/compliance_checker/training_4.1.0/closed_stable_diffusion.yaml +74 -0
- mlperf_logging/compliance_checker/training_4.1.0/common.yaml +146 -0
- mlperf_logging/compliance_checker/training_4.1.0/open_bert.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.1.0/open_common.yaml +6 -0
- mlperf_logging/compliance_checker/training_4.1.0/open_dlrm_dcnv2.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.1.0/open_gnn.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.1.0/open_gpt3.yaml +79 -0
- mlperf_logging/compliance_checker/training_4.1.0/open_llama2_70b_lora.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.1.0/open_ssd.yaml +7 -0
- mlperf_logging/compliance_checker/training_4.1.0/open_stable_diffusion.yaml +33 -0
- mlperf_logging/mllog/__init__.py +97 -0
- mlperf_logging/mllog/constants.py +190 -0
- mlperf_logging/mllog/examples/__init__.py +14 -0
- mlperf_logging/mllog/examples/dummy_example.py +87 -0
- mlperf_logging/mllog/examples/linear_regression_example.py +311 -0
- mlperf_logging/mllog/examples/power/__init__.py +14 -0
- mlperf_logging/mllog/examples/power/compute_metric_example.py +33 -0
- mlperf_logging/mllog/examples/power/power_measurement.py +194 -0
- mlperf_logging/mllog/examples/power/reader.py +73 -0
- mlperf_logging/mllog/mllog.py +261 -0
- mlperf_logging/mllog/test_mllog.py +133 -0
- mlperf_logging/package_checker/__init__.py +0 -0
- mlperf_logging/package_checker/__main__.py +3 -0
- mlperf_logging/package_checker/package_checker.py +337 -0
- mlperf_logging/package_checker/seed_checker.py +148 -0
- mlperf_logging/rcp_checker/__init__.py +0 -0
- mlperf_logging/rcp_checker/__main__.py +3 -0
- mlperf_logging/rcp_checker/hpc_1.0.0/rcps_cosmoflow.json +66 -0
- mlperf_logging/rcp_checker/hpc_1.0.0/rcps_deepcam.json +117 -0
- mlperf_logging/rcp_checker/hpc_1.0.0/rcps_oc20.json +41 -0
- mlperf_logging/rcp_checker/hpc_2.0.0/rcps_cosmoflow.json +82 -0
- mlperf_logging/rcp_checker/hpc_2.0.0/rcps_deepcam.json +119 -0
- mlperf_logging/rcp_checker/hpc_2.0.0/rcps_oc20.json +41 -0
- mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json +82 -0
- mlperf_logging/rcp_checker/hpc_3.0.0/rcps_deepcam.json +119 -0
- mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json +41 -0
- mlperf_logging/rcp_checker/hpc_3.0.0/rcps_openfold.json +52 -0
- mlperf_logging/rcp_checker/rcp_checker.py +576 -0
- mlperf_logging/rcp_checker/training_1.0.0/rcps_bert.json +163 -0
- mlperf_logging/rcp_checker/training_1.0.0/rcps_dlrm.json +59 -0
- mlperf_logging/rcp_checker/training_1.0.0/rcps_maskrcnn.json +113 -0
- mlperf_logging/rcp_checker/training_1.0.0/rcps_resnet.json +192 -0
- mlperf_logging/rcp_checker/training_1.0.0/rcps_rnnt.json +75 -0
- mlperf_logging/rcp_checker/training_1.0.0/rcps_ssd.json +63 -0
- mlperf_logging/rcp_checker/training_1.0.0/rcps_unet3d.json +131 -0
- mlperf_logging/rcp_checker/training_1.1.0/rcps_bert.json +256 -0
- mlperf_logging/rcp_checker/training_1.1.0/rcps_dlrm.json +65 -0
- mlperf_logging/rcp_checker/training_1.1.0/rcps_maskrcnn.json +113 -0
- mlperf_logging/rcp_checker/training_1.1.0/rcps_resnet.json +197 -0
- mlperf_logging/rcp_checker/training_1.1.0/rcps_rnnt.json +115 -0
- mlperf_logging/rcp_checker/training_1.1.0/rcps_ssd.json +93 -0
- mlperf_logging/rcp_checker/training_1.1.0/rcps_unet3d.json +146 -0
- mlperf_logging/rcp_checker/training_2.0.0/rcps_bert.json +302 -0
- mlperf_logging/rcp_checker/training_2.0.0/rcps_dlrm.json +65 -0
- mlperf_logging/rcp_checker/training_2.0.0/rcps_maskrcnn.json +131 -0
- mlperf_logging/rcp_checker/training_2.0.0/rcps_resnet.json +221 -0
- mlperf_logging/rcp_checker/training_2.0.0/rcps_rnnt.json +168 -0
- mlperf_logging/rcp_checker/training_2.0.0/rcps_ssd.json +57 -0
- mlperf_logging/rcp_checker/training_2.0.0/rcps_unet3d.json +146 -0
- mlperf_logging/rcp_checker/training_2.1.0/rcps_bert.json +303 -0
- mlperf_logging/rcp_checker/training_2.1.0/rcps_dlrm.json +65 -0
- mlperf_logging/rcp_checker/training_2.1.0/rcps_maskrcnn.json +131 -0
- mlperf_logging/rcp_checker/training_2.1.0/rcps_resnet.json +221 -0
- mlperf_logging/rcp_checker/training_2.1.0/rcps_rnnt.json +168 -0
- mlperf_logging/rcp_checker/training_2.1.0/rcps_ssd.json +92 -0
- mlperf_logging/rcp_checker/training_2.1.0/rcps_unet3d.json +146 -0
- mlperf_logging/rcp_checker/training_3.0.0/rcps_bert.json +303 -0
- mlperf_logging/rcp_checker/training_3.0.0/rcps_dlrm_dcnv2.json +133 -0
- mlperf_logging/rcp_checker/training_3.0.0/rcps_gpt3.json +78 -0
- mlperf_logging/rcp_checker/training_3.0.0/rcps_maskrcnn.json +131 -0
- mlperf_logging/rcp_checker/training_3.0.0/rcps_resnet.json +221 -0
- mlperf_logging/rcp_checker/training_3.0.0/rcps_rnnt.json +168 -0
- mlperf_logging/rcp_checker/training_3.0.0/rcps_ssd.json +127 -0
- mlperf_logging/rcp_checker/training_3.0.0/rcps_unet3d.json +182 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_bert.json +303 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_dlrm_dcnv2.json +133 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_gpt3.json +78 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_maskrcnn.json +131 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_resnet.json +221 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_rnnt.json +168 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_ssd.json +145 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_stable_diffusion.json +66 -0
- mlperf_logging/rcp_checker/training_3.1.0/rcps_unet3d.json +178 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_bert.json +303 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_dlrm_dcnv2.json +133 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json +90 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_gpt3.json +78 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json +91 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_resnet.json +221 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json +163 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json +88 -0
- mlperf_logging/rcp_checker/training_4.0.0/rcps_unet3d.json +132 -0
- mlperf_logging/rcp_checker/training_4.1.0/rcps_bert.json +303 -0
- mlperf_logging/rcp_checker/training_4.1.0/rcps_dlrm_dcnv2.json +133 -0
- mlperf_logging/rcp_checker/training_4.1.0/rcps_gnn.json +90 -0
- mlperf_logging/rcp_checker/training_4.1.0/rcps_gpt3.json +93 -0
- mlperf_logging/rcp_checker/training_4.1.0/rcps_llama2_70b_lora.json +91 -0
- mlperf_logging/rcp_checker/training_4.1.0/rcps_ssd.json +163 -0
- mlperf_logging/rcp_checker/training_4.1.0/rcps_stable_diffusion.json +88 -0
- mlperf_logging/repo_checker/__init__.py +0 -0
- mlperf_logging/repo_checker/__main__.py +3 -0
- mlperf_logging/repo_checker/repo_checker.py +155 -0
- mlperf_logging/result_summarizer/__init__.py +0 -0
- mlperf_logging/result_summarizer/__main__.py +3 -0
- mlperf_logging/result_summarizer/result_summarizer.py +977 -0
- mlperf_logging/system_desc_checker/__init__.py +0 -0
- mlperf_logging/system_desc_checker/__main__.py +3 -0
- mlperf_logging/system_desc_checker/system_desc_checker.py +179 -0
- mlperf_logging-3.0.0.dist-info/LICENSE.md +177 -0
- mlperf_logging-3.0.0.dist-info/METADATA +65 -0
- mlperf_logging-3.0.0.dist-info/RECORD +369 -0
- mlperf_logging-3.0.0.dist-info/WHEEL +5 -0
- mlperf_logging-3.0.0.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# benchmark dictionary
|
|
2
|
+
# Per-benchmark result-file counts for the 'training' usage.
_TRAINING_RESULT_FILE_COUNTS = {
    "bert": 10,
    "dlrm": 5,
    "dlrm_dcnv2": 10,
    "gnmt": 10,
    "gpt3": 3,
    "maskrcnn": 5,
    "minigo": 10,
    "resnet": 5,
    "ssd": 5,
    "stable_diffusion": 10,
    "transformer": 10,
    "ncf": 10,
    "rnnt": 10,
    "unet3d": 40,
    "gnn": 10,
    "llama2_70b_lora": 10,
}

# Per-benchmark result-file counts for the 'hpc' usage.
_HPC_RESULT_FILE_COUNTS = {
    "deepcam": 5,
    "cosmoflow": 10,
    "oc20": 5,
    "openfold": 10,
}

# Lookup table: usage name -> {benchmark name -> result-file count}.
# Read through get_result_file_counts().
_ALL_RESULT_FILE_COUNTS = {
    "training": _TRAINING_RESULT_FILE_COUNTS,
    "hpc": _HPC_RESULT_FILE_COUNTS,
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Lookup table: usage name -> {ruleset ('major.minor') -> list of benchmark
# names accepted for that ruleset}. Read through get_allowed_benchmarks().
# Each benchmark name is listed at most once per ruleset.
_ALL_ALLOWED_BENCHMARKS = {
    'training': {
        '0.6': [
            'resnet',
            'ssd',
            'maskrcnn',
            'gnmt',
            'transformer',
            'ncf',
            'minigo',
        ],

        '0.7': [
            'bert',
            'dlrm',
            'gnmt',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'transformer',
        ],
        '1.0': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '1.1': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '2.0': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '2.1': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '3.0': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'maskrcnn',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '3.1': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'maskrcnn',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
            'stable_diffusion',
        ],
        '4.0': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'resnet',
            'ssd',
            'unet3d',
            'stable_diffusion',
            'llama2_70b_lora',
            'gnn',
        ],
        '4.1': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'ssd',
            'stable_diffusion',
            'llama2_70b_lora',
            'gnn',
        ],
    },

    'hpc': {
        '0.7': [
            'cosmoflow',
            'deepcam',
        ],

        '1.0': [
            'cosmoflow',
            'deepcam',
            'oc20',
        ],
        '2.0': [
            'cosmoflow',
            'deepcam',
            'oc20',
        ],
        '3.0': [
            'cosmoflow',
            'deepcam',
            'oc20',
            'openfold',
        ],
    },
}
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def get_allowed_benchmarks(usage, ruleset):
    """Return the benchmark names allowed for a usage/ruleset pair.

    The ruleset is looked up verbatim first. If that fails, the text after
    the last '.' is dropped (e.g. '4.1.0' -> '4.1') and the lookup is
    retried once.

    Args:
        usage: Top-level key of ``_ALL_ALLOWED_BENCHMARKS``.
        ruleset: Ruleset version string.

    Returns:
        The list stored in ``_ALL_ALLOWED_BENCHMARKS`` for the matched
        ruleset (the stored object itself, not a copy).

    Raises:
        ValueError: If ``usage`` is unknown, or if neither ``ruleset`` nor
            its shortened form is registered for ``usage``.
    """
    if usage not in _ALL_ALLOWED_BENCHMARKS:
        raise ValueError('usage {} not supported!'.format(usage))

    rulesets_for_usage = _ALL_ALLOWED_BENCHMARKS[usage]

    # Exact match wins.
    if ruleset in rulesets_for_usage:
        return rulesets_for_usage[ruleset]

    # Fall back to the version with its last dotted component removed;
    # a ruleset without any '.' shortens to the empty string.
    shortened = ruleset.rpartition(".")[0]
    if shortened not in rulesets_for_usage:
        raise ValueError('ruleset {} is not supported in {}'.format(ruleset, usage))
    return rulesets_for_usage[shortened]
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def get_result_file_counts(usage):
    """Return the benchmark -> result-file-count mapping for ``usage``.

    Args:
        usage: Top-level key of ``_ALL_RESULT_FILE_COUNTS``.

    Returns:
        The dict stored in ``_ALL_RESULT_FILE_COUNTS`` for ``usage`` (the
        stored object itself, not a copy).

    Raises:
        ValueError: If ``usage`` is not a key of ``_ALL_RESULT_FILE_COUNTS``.
    """
    if usage in _ALL_RESULT_FILE_COUNTS:
        return _ALL_RESULT_FILE_COUNTS[usage]
    raise ValueError('usage {} not supported!'.format(usage))
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from . import mlp_compliance
|
|
5
|
+
|
|
6
|
+
# Script body: parse the command line, configure logging, run the compliance
# check on args.filename and report the outcome through log messages and the
# process exit status (1 on failure).
parser = mlp_compliance.get_parser()
args = parser.parse_args()

# Configure the root logger via basicConfig (pointed at args.log_output),
# then attach an extra StreamHandler. Both of the first two root handlers
# share one "<LEVEL> - <message>" format.
root_logger = logging.getLogger()
logging.basicConfig(filename=args.log_output, level=logging.INFO)
root_logger.addHandler(logging.StreamHandler())
formatter = logging.Formatter("%(levelname)s - %(message)s")
for handler_index in (0, 1):
    root_logger.handlers[handler_index].setFormatter(formatter)

# An explicit --config wins; otherwise use the common rule file of the
# selected usage/ruleset directory.
if args.config:
    config_file = args.config
else:
    config_file = f'{args.usage}_{args.ruleset}/common.yaml'

checker = mlp_compliance.make_checker(args.usage, args.ruleset, args.quiet, args.werror)

valid, system_id, benchmark, result = mlp_compliance.main(args.filename, config_file, checker)

if valid:
    print('** Logging output also at', args.log_output)
    logging.info('SUCCESS')
else:
    logging.error('FAILED')
    print('** Logging output also at', args.log_output)
    sys.exit(1)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
|
|
2
|
+
- KEY:
|
|
3
|
+
NAME: submission_benchmark
|
|
4
|
+
REQ: EXACTLY_ONE
|
|
5
|
+
CHECK: " v['value'] in ['deepcam', 'cosmoflow', 'oc20'] "
|
|
6
|
+
POST: " enqueue_config('hpc_1.0.0/closed_{}.yaml'.format(v['value'])) "
|
|
7
|
+
|
|
8
|
+
- KEY:
|
|
9
|
+
NAME: gradient_accumulation_steps
|
|
10
|
+
CHECK: " v['value'] > 0 "
|
|
11
|
+
|
|
12
|
+
- KEY:
|
|
13
|
+
NAME: gradient_accumulation_frequency
|
|
14
|
+
CHECK: " v['value'] > 0 "
|
|
15
|
+
|
|
16
|
+
- KEY:
|
|
17
|
+
NAME: number_of_nodes
|
|
18
|
+
REQ: EXACTLY_ONE
|
|
19
|
+
CHECK: " v['value'] > 0"
|
|
20
|
+
|
|
21
|
+
- KEY:
|
|
22
|
+
NAME: accelerators_per_node
|
|
23
|
+
REQ: EXACTLY_ONE
|
|
24
|
+
CHECK: " v['value'] >= 0"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
- KEY:
|
|
2
|
+
NAME: global_batch_size
|
|
3
|
+
REQ: EXACTLY_ONE
|
|
4
|
+
CHECK: " v['value'] > 0"
|
|
5
|
+
|
|
6
|
+
- KEY:
|
|
7
|
+
NAME: opt_name
|
|
8
|
+
REQ: EXACTLY_ONE
|
|
9
|
+
CHECK: " v['value'] in ['sgd', 'SGD'] "
|
|
10
|
+
|
|
11
|
+
- KEY:
|
|
12
|
+
NAME: opt_base_learning_rate
|
|
13
|
+
REQ: EXACTLY_ONE
|
|
14
|
+
CHECK: " v['value'] >= 0."
|
|
15
|
+
|
|
16
|
+
- KEY:
|
|
17
|
+
NAME: opt_learning_rate_warmup_epochs
|
|
18
|
+
REQ: EXACTLY_ONE
|
|
19
|
+
CHECK: " v['value'] >= 0"
|
|
20
|
+
|
|
21
|
+
- KEY:
|
|
22
|
+
NAME: opt_learning_rate_warmup_factor
|
|
23
|
+
REQ: EXACTLY_ONE
|
|
24
|
+
CHECK: " v['value'] >= 0."
|
|
25
|
+
|
|
26
|
+
- KEY:
|
|
27
|
+
NAME: opt_learning_rate_decay_boundary_epochs
|
|
28
|
+
REQ: EXACTLY_ONE
|
|
29
|
+
|
|
30
|
+
- KEY:
|
|
31
|
+
NAME: opt_learning_rate_decay_factor
|
|
32
|
+
REQ: EXACTLY_ONE
|
|
33
|
+
|
|
34
|
+
- KEY:
|
|
35
|
+
NAME: dropout
|
|
36
|
+
CHECK: " v['value'] >= 0. and v['value'] < 1."
|
|
37
|
+
|
|
38
|
+
- KEY:
|
|
39
|
+
NAME: opt_weight_decay
|
|
40
|
+
CHECK: " v['value'] >= 0."
|
|
41
|
+
|
|
42
|
+
- KEY:
|
|
43
|
+
NAME: eval_error
|
|
44
|
+
REQ: AT_LEAST_ONE
|
|
45
|
+
CHECK:
|
|
46
|
+
- "'epoch_num' in v['metadata']"
|
|
47
|
+
ATLEAST_ONE_CHECK: "v['value'] <= 0.124 and v['value'] > 0."
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# General Settings
|
|
2
|
+
- KEY:
|
|
3
|
+
NAME: gradient_accumulation_frequency
|
|
4
|
+
REQ: EXACTLY_ONE
|
|
5
|
+
CHECK: " v['value'] > 0 "
|
|
6
|
+
|
|
7
|
+
- KEY:
|
|
8
|
+
NAME: seed
|
|
9
|
+
REQ: EXACTLY_ONE
|
|
10
|
+
CHECK: " v['value'] > 0"
|
|
11
|
+
|
|
12
|
+
- KEY:
|
|
13
|
+
NAME: global_batch_size
|
|
14
|
+
REQ: EXACTLY_ONE
|
|
15
|
+
CHECK: " v['value'] > 0"
|
|
16
|
+
|
|
17
|
+
- KEY:
|
|
18
|
+
NAME: batchnorm_group_size
|
|
19
|
+
REQ: EXACTLY_ONE
|
|
20
|
+
CHECK: " v['value'] > 0"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Optimizer Parameters
|
|
24
|
+
- KEY:
|
|
25
|
+
NAME: opt_name
|
|
26
|
+
REQ: EXACTLY_ONE
|
|
27
|
+
CHECK: " v['value'] in ['Adam', 'AdamW', 'LAMB']"
|
|
28
|
+
POST: " if (v['value'] == 'LAMB'): enqueue_config('hpc_1.0.0/closed_deepcam_lamb.yaml') "
|
|
29
|
+
|
|
30
|
+
- KEY:
|
|
31
|
+
NAME: opt_lr
|
|
32
|
+
REQ: EXACTLY_ONE
|
|
33
|
+
CHECK: " v['value'] >0."
|
|
34
|
+
|
|
35
|
+
- KEY:
|
|
36
|
+
NAME: opt_betas
|
|
37
|
+
REQ: EXACTLY_ONE
|
|
38
|
+
CHECK: " len(v['value']) == 2"
|
|
39
|
+
|
|
40
|
+
- KEY:
|
|
41
|
+
NAME: opt_eps
|
|
42
|
+
REQ: EXACTLY_ONE
|
|
43
|
+
CHECK: " math.isclose(v['value'], 1e-6)"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Scheduler Parameters
|
|
47
|
+
- KEY:
|
|
48
|
+
NAME: scheduler_type
|
|
49
|
+
REQ: EXACTLY_ONE
|
|
50
|
+
CHECK: " v['value'] in ['multistep', 'cosine_annealing']"
|
|
51
|
+
POST: " enqueue_config('hpc_1.0.0/closed_deepcam_{}.yaml'.format(v['value'].lower())) "
|
|
52
|
+
|
|
53
|
+
- KEY:
|
|
54
|
+
NAME: scheduler_lr_warmup_steps
|
|
55
|
+
REQ: EXACTLY_ONE
|
|
56
|
+
CHECK: " v['value'] >= 0 "
|
|
57
|
+
|
|
58
|
+
- KEY:
|
|
59
|
+
NAME: scheduler_lr_warmup_factor
|
|
60
|
+
REQ: EXACTLY_ONE
|
|
61
|
+
CHECK: " v['value'] >= 1. "
|
|
62
|
+
|
|
63
|
+
# Dataset Properties
|
|
64
|
+
- KEY:
|
|
65
|
+
NAME: train_samples
|
|
66
|
+
REQ: EXACTLY_ONE
|
|
67
|
+
CHECK: " v['value'] == 121266"
|
|
68
|
+
|
|
69
|
+
- KEY:
|
|
70
|
+
NAME: eval_samples
|
|
71
|
+
REQ: EXACTLY_ONE
|
|
72
|
+
CHECK: " v['value'] == 15158"
|
|
73
|
+
|
|
74
|
+
# Convergence Properties
|
|
75
|
+
- KEY:
|
|
76
|
+
NAME: eval_accuracy
|
|
77
|
+
REQ: AT_LEAST_ONE
|
|
78
|
+
CHECK:
|
|
79
|
+
- "'epoch_num' in v['metadata']"
|
|
80
|
+
ATLEAST_ONE_CHECK: "v['value'] >= 0.82 and v['value'] <= 1."
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Optimizer Parameters
|
|
2
|
+
- KEY:
|
|
3
|
+
NAME: opt_bias_correction
|
|
4
|
+
REQ: EXACTLY_ONE
|
|
5
|
+
CHECK: " v['value'] "
|
|
6
|
+
|
|
7
|
+
- KEY:
|
|
8
|
+
NAME: opt_grad_averaging
|
|
9
|
+
REQ: EXACTLY_ONE
|
|
10
|
+
CHECK: " v['value'] "
|
|
11
|
+
|
|
12
|
+
- KEY:
|
|
13
|
+
NAME: opt_max_grad_norm
|
|
14
|
+
REQ: EXACTLY_ONE
|
|
15
|
+
CHECK: " v['value'] == 1."
|
|
16
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
- KEY:
|
|
2
|
+
NAME: global_batch_size
|
|
3
|
+
REQ: EXACTLY_ONE
|
|
4
|
+
CHECK: " v['value'] > 0"
|
|
5
|
+
|
|
6
|
+
- KEY:
|
|
7
|
+
NAME: opt_name
|
|
8
|
+
REQ: EXACTLY_ONE
|
|
9
|
+
CHECK: " v['value'] == 'AdamW'"
|
|
10
|
+
|
|
11
|
+
- KEY:
|
|
12
|
+
NAME: opt_base_learning_rate
|
|
13
|
+
REQ: EXACTLY_ONE
|
|
14
|
+
CHECK: " v['value'] >= 0."
|
|
15
|
+
|
|
16
|
+
- KEY:
|
|
17
|
+
NAME: opt_learning_rate_warmup_steps
|
|
18
|
+
REQ: EXACTLY_ONE
|
|
19
|
+
CHECK: " v['value'] >= 0"
|
|
20
|
+
|
|
21
|
+
- KEY:
|
|
22
|
+
NAME: opt_learning_rate_warmup_factor
|
|
23
|
+
REQ: EXACTLY_ONE
|
|
24
|
+
CHECK: " v['value'] >= 0."
|
|
25
|
+
|
|
26
|
+
- KEY:
|
|
27
|
+
NAME: opt_learning_rate_decay_boundary_steps
|
|
28
|
+
REQ: EXACTLY_ONE
|
|
29
|
+
|
|
30
|
+
- KEY:
|
|
31
|
+
NAME: opt_learning_rate_decay_factor
|
|
32
|
+
REQ: EXACTLY_ONE
|
|
33
|
+
|
|
34
|
+
- KEY:
|
|
35
|
+
NAME: eval_error
|
|
36
|
+
REQ: AT_LEAST_ONE
|
|
37
|
+
CHECK:
|
|
38
|
+
- "'epoch_num' in v['metadata']"
|
|
39
|
+
ATLEAST_ONE_CHECK: "v['value'] <= 0.036 and v['value'] > 0."
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# This file lists all the KEYs to be checked. Every line that matches mlperf logging regex (::MLL...) will be checked against these rules.
|
|
2
|
+
# In order of appearance in the log, the checker executes, for each line, the code specified under CHECK for the KEY in that line.
|
|
3
|
+
# The code will be launched using local state 'v' which is the content of value field in log line, and global state 's'.
|
|
4
|
+
# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's' use a BEGIN record, whose CODE will
|
|
5
|
+
# be executed before any checks.
|
|
6
|
+
# In addition, occurrence of each key will be counted and at the end if a requirement regarding the number of occurrences is defined it will
|
|
7
|
+
# be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported.
|
|
8
|
+
#
|
|
9
|
+
# KEY record:
|
|
10
|
+
# NAME
|
|
11
|
+
# REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE, AT_LEAST_ONE_OR(<other key>)}
|
|
12
|
+
# PRE - optional - code to be executed before CHECK
|
|
13
|
+
# CHECK - optional - expression to be evaluated to verify correctness
|
|
14
|
+
# POST - optional - code to be executed after CHECK
|
|
15
|
+
|
|
16
|
+
- BEGIN:
|
|
17
|
+
CODE: >
|
|
18
|
+
s.update({
|
|
19
|
+
'init_started': False,
|
|
20
|
+
'init_stopped' : False,
|
|
21
|
+
'run_started' : False,
|
|
22
|
+
'run_stopped' : False,
|
|
23
|
+
'in_epoch' : False,
|
|
24
|
+
'last_epoch' : 0,
|
|
25
|
+
'in_block' : False,
|
|
26
|
+
'block_first_epoch' : -1,
|
|
27
|
+
'first_init_start': 9e99,
|
|
28
|
+
})
|
|
29
|
+
|
|
30
|
+
- KEY:
|
|
31
|
+
NAME: submission_org
|
|
32
|
+
REQ: EXACTLY_ONE
|
|
33
|
+
CHECK: " v['value'] != '' "
|
|
34
|
+
|
|
35
|
+
- KEY:
|
|
36
|
+
NAME: submission_platform
|
|
37
|
+
REQ: EXACTLY_ONE
|
|
38
|
+
CHECK: " v['value'] != '' "
|
|
39
|
+
|
|
40
|
+
- KEY:
|
|
41
|
+
NAME: submission_division
|
|
42
|
+
REQ: EXACTLY_ONE
|
|
43
|
+
CHECK: " v['value'] in ['closed', 'open'] "
|
|
44
|
+
POST: " enqueue_config('hpc_1.0.0/{}_common.yaml'.format(v['value'])) "
|
|
45
|
+
|
|
46
|
+
- KEY:
|
|
47
|
+
NAME: submission_status
|
|
48
|
+
REQ: EXACTLY_ONE
|
|
49
|
+
CHECK: " v['value'] in ['cloud', 'onprem', 'preview', 'research'] "
|
|
50
|
+
|
|
51
|
+
# at least one record should be found, but any found records must pass the test
|
|
52
|
+
- KEY:
|
|
53
|
+
NAME: cache_clear
|
|
54
|
+
REQ: AT_LEAST_ONE
|
|
55
|
+
CHECK:
|
|
56
|
+
- "'value' in v"
|
|
57
|
+
|
|
58
|
+
# frequency not checked
|
|
59
|
+
- KEY:
|
|
60
|
+
NAME: init_start
|
|
61
|
+
REQ: AT_LEAST_ONE
|
|
62
|
+
CHECK:
|
|
63
|
+
- "not s['init_stopped']"
|
|
64
|
+
- "not s['run_started']"
|
|
65
|
+
POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) "
|
|
66
|
+
|
|
67
|
+
# confirm less than 30min since the very first init_start
|
|
68
|
+
- KEY:
|
|
69
|
+
NAME: init_stop
|
|
70
|
+
REQ: EXACTLY_ONE
|
|
71
|
+
CHECK:
|
|
72
|
+
- "s['init_started']"
|
|
73
|
+
- "not s['run_started']"
|
|
74
|
+
- "ll.timestamp - s['first_init_start'] < (30*60*1e3)"
|
|
75
|
+
POST: " s['init_stopped'] = True"
|
|
76
|
+
|
|
77
|
+
# HPC requires data staging to be included in run timing
|
|
78
|
+
- KEY:
|
|
79
|
+
NAME: staging_start
|
|
80
|
+
CHECK:
|
|
81
|
+
- "s['run_started']"
|
|
82
|
+
POST: " s['staging_started'] = True "
|
|
83
|
+
|
|
84
|
+
- KEY:
|
|
85
|
+
NAME: staging_stop
|
|
86
|
+
CHECK:
|
|
87
|
+
- "s['staging_started']"
|
|
88
|
+
|
|
89
|
+
# run start and run stop
|
|
90
|
+
- KEY:
|
|
91
|
+
NAME: run_start
|
|
92
|
+
REQ: EXACTLY_ONE
|
|
93
|
+
CHECK: " ( s['init_stopped'] == True )"
|
|
94
|
+
POST: " s['run_started'] = True "
|
|
95
|
+
|
|
96
|
+
# status can also be aborted, but not allowing it here for now
|
|
97
|
+
# if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok
|
|
98
|
+
- KEY:
|
|
99
|
+
NAME: run_stop
|
|
100
|
+
REQ: EXACTLY_ONE
|
|
101
|
+
CHECK:
|
|
102
|
+
- "s['run_started']"
|
|
103
|
+
- "'status' in v['metadata']"
|
|
104
|
+
POST: " s['run_stopped'] = True "
|
|
105
|
+
|
|
106
|
+
# FIXME: check epoch_count value match
|
|
107
|
+
- KEY:
|
|
108
|
+
NAME: block_start
|
|
109
|
+
REQ: AT_LEAST_ONE_OR(epoch_start)
|
|
110
|
+
CHECK:
|
|
111
|
+
- "s['run_started']"
|
|
112
|
+
- "'epoch_count' in v['metadata']"
|
|
113
|
+
- "'first_epoch_num' in v['metadata']"
|
|
114
|
+
- "v['metadata']['epoch_count'] > 0"
|
|
115
|
+
|
|
116
|
+
- KEY:
|
|
117
|
+
NAME: block_stop
|
|
118
|
+
REQ: AT_LEAST_ONE_OR(epoch_stop)
|
|
119
|
+
CHECK:
|
|
120
|
+
- "'first_epoch_num' in v['metadata']"
|
|
121
|
+
|
|
122
|
+
- KEY:
|
|
123
|
+
NAME: epoch_start
|
|
124
|
+
REQ: AT_LEAST_ONE_OR(block_start)
|
|
125
|
+
CHECK:
|
|
126
|
+
- "'epoch_num' in v['metadata']"
|
|
127
|
+
|
|
128
|
+
- KEY:
|
|
129
|
+
NAME: epoch_stop
|
|
130
|
+
REQ: AT_LEAST_ONE_OR(block_stop)
|
|
131
|
+
CHECK:
|
|
132
|
+
- "'epoch_num' in v['metadata']"
|
|
133
|
+
|
|
134
|
+
# making sure previous eval did print its accuracy result
|
|
135
|
+
- KEY:
|
|
136
|
+
NAME: eval_start
|
|
137
|
+
REQ: AT_LEAST_ONE_OR(block_start)
|
|
138
|
+
CHECK:
|
|
139
|
+
- "'epoch_num' in v['metadata']"
|
|
140
|
+
|
|
141
|
+
- KEY:
|
|
142
|
+
NAME: eval_stop
|
|
143
|
+
REQ: AT_LEAST_ONE_OR(block_stop)
|
|
144
|
+
CHECK:
|
|
145
|
+
- "'epoch_num' in v['metadata']"
|
|
146
|
+
|
|
147
|
+
- KEY:
|
|
148
|
+
NAME: train_samples
|
|
149
|
+
REQ: EXACTLY_ONE
|
|
150
|
+
CHECK: " v['value'] != '' "
|
|
151
|
+
|
|
152
|
+
- KEY:
|
|
153
|
+
NAME: eval_samples
|
|
154
|
+
REQ: EXACTLY_ONE
|
|
155
|
+
CHECK: " v['value'] != '' "
|
|
156
|
+
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
|
|
2
|
+
- KEY:
|
|
3
|
+
NAME: submission_benchmark
|
|
4
|
+
REQ: EXACTLY_ONE
|
|
5
|
+
CHECK: " v['value'] in ['deepcam', 'cosmoflow', 'oc20'] "
|
|
6
|
+
POST: " enqueue_config('hpc_2.0.0/closed_{}.yaml'.format(v['value'])) "
|
|
7
|
+
|
|
8
|
+
- KEY:
|
|
9
|
+
NAME: gradient_accumulation_steps
|
|
10
|
+
CHECK: " v['value'] > 0 "
|
|
11
|
+
|
|
12
|
+
- KEY:
|
|
13
|
+
NAME: gradient_accumulation_frequency
|
|
14
|
+
CHECK: " v['value'] > 0 "
|
|
15
|
+
|
|
16
|
+
- KEY:
|
|
17
|
+
NAME: number_of_nodes
|
|
18
|
+
REQ: EXACTLY_ONE
|
|
19
|
+
CHECK: " v['value'] > 0"
|
|
20
|
+
|
|
21
|
+
- KEY:
|
|
22
|
+
NAME: accelerators_per_node
|
|
23
|
+
REQ: EXACTLY_ONE
|
|
24
|
+
CHECK: " v['value'] >= 0"
|