mlperf-logging 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369)
  1. mlperf_logging/__init__.py +0 -0
  2. mlperf_logging/benchmark_meta.py +184 -0
  3. mlperf_logging/compliance_checker/__init__.py +0 -0
  4. mlperf_logging/compliance_checker/__main__.py +32 -0
  5. mlperf_logging/compliance_checker/hpc_1.0.0/closed_common.yaml +24 -0
  6. mlperf_logging/compliance_checker/hpc_1.0.0/closed_cosmoflow.yaml +47 -0
  7. mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam.yaml +80 -0
  8. mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_cosine_annealing.yaml +10 -0
  9. mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_lamb.yaml +16 -0
  10. mlperf_logging/compliance_checker/hpc_1.0.0/closed_deepcam_multistep.yaml +10 -0
  11. mlperf_logging/compliance_checker/hpc_1.0.0/closed_oc20.yaml +39 -0
  12. mlperf_logging/compliance_checker/hpc_1.0.0/common.yaml +156 -0
  13. mlperf_logging/compliance_checker/hpc_1.0.0/open_common.yaml +5 -0
  14. mlperf_logging/compliance_checker/hpc_1.0.0/open_cosmoflow.yaml +6 -0
  15. mlperf_logging/compliance_checker/hpc_1.0.0/open_deepcam.yaml +6 -0
  16. mlperf_logging/compliance_checker/hpc_1.0.0/open_oc20.yaml +6 -0
  17. mlperf_logging/compliance_checker/hpc_2.0.0/closed_common.yaml +24 -0
  18. mlperf_logging/compliance_checker/hpc_2.0.0/closed_cosmoflow.yaml +47 -0
  19. mlperf_logging/compliance_checker/hpc_2.0.0/closed_deepcam.yaml +80 -0
  20. mlperf_logging/compliance_checker/hpc_2.0.0/closed_deepcam_cosine_annealing.yaml +10 -0
  21. mlperf_logging/compliance_checker/hpc_2.0.0/closed_deepcam_lamb.yaml +16 -0
  22. mlperf_logging/compliance_checker/hpc_2.0.0/closed_deepcam_multistep.yaml +10 -0
  23. mlperf_logging/compliance_checker/hpc_2.0.0/closed_oc20.yaml +39 -0
  24. mlperf_logging/compliance_checker/hpc_2.0.0/common.yaml +156 -0
  25. mlperf_logging/compliance_checker/hpc_2.0.0/open_common.yaml +5 -0
  26. mlperf_logging/compliance_checker/hpc_2.0.0/open_cosmoflow.yaml +6 -0
  27. mlperf_logging/compliance_checker/hpc_2.0.0/open_deepcam.yaml +6 -0
  28. mlperf_logging/compliance_checker/hpc_2.0.0/open_oc20.yaml +6 -0
  29. mlperf_logging/compliance_checker/hpc_3.0.0/closed_common.yaml +24 -0
  30. mlperf_logging/compliance_checker/hpc_3.0.0/closed_cosmoflow.yaml +47 -0
  31. mlperf_logging/compliance_checker/hpc_3.0.0/closed_deepcam.yaml +80 -0
  32. mlperf_logging/compliance_checker/hpc_3.0.0/closed_deepcam_cosine_annealing.yaml +10 -0
  33. mlperf_logging/compliance_checker/hpc_3.0.0/closed_deepcam_lamb.yaml +16 -0
  34. mlperf_logging/compliance_checker/hpc_3.0.0/closed_deepcam_multistep.yaml +10 -0
  35. mlperf_logging/compliance_checker/hpc_3.0.0/closed_oc20.yaml +39 -0
  36. mlperf_logging/compliance_checker/hpc_3.0.0/closed_openfold.yaml +196 -0
  37. mlperf_logging/compliance_checker/hpc_3.0.0/common.yaml +154 -0
  38. mlperf_logging/compliance_checker/hpc_3.0.0/open_common.yaml +5 -0
  39. mlperf_logging/compliance_checker/hpc_3.0.0/open_cosmoflow.yaml +6 -0
  40. mlperf_logging/compliance_checker/hpc_3.0.0/open_deepcam.yaml +6 -0
  41. mlperf_logging/compliance_checker/hpc_3.0.0/open_oc20.yaml +6 -0
  42. mlperf_logging/compliance_checker/hpc_3.0.0/open_openfold.yaml +6 -0
  43. mlperf_logging/compliance_checker/mlp_compliance.py +345 -0
  44. mlperf_logging/compliance_checker/mlp_parser/__init__.py +35 -0
  45. mlperf_logging/compliance_checker/mlp_parser/ruleset_060.py +111 -0
  46. mlperf_logging/compliance_checker/mlp_parser/ruleset_070.py +104 -0
  47. mlperf_logging/compliance_checker/mlp_parser/ruleset_100.py +104 -0
  48. mlperf_logging/compliance_checker/mlp_parser/ruleset_110.py +104 -0
  49. mlperf_logging/compliance_checker/mlp_parser/ruleset_200.py +104 -0
  50. mlperf_logging/compliance_checker/mlp_parser/ruleset_210.py +104 -0
  51. mlperf_logging/compliance_checker/mlp_parser/ruleset_300.py +104 -0
  52. mlperf_logging/compliance_checker/mlp_parser/ruleset_310.py +105 -0
  53. mlperf_logging/compliance_checker/mlp_parser/ruleset_400.py +105 -0
  54. mlperf_logging/compliance_checker/mlp_parser/ruleset_410.py +105 -0
  55. mlperf_logging/compliance_checker/training_0.6.0/common.yaml +135 -0
  56. mlperf_logging/compliance_checker/training_0.6.0/gnmt.yaml +51 -0
  57. mlperf_logging/compliance_checker/training_0.6.0/maskrcnn.yaml +27 -0
  58. mlperf_logging/compliance_checker/training_0.6.0/minigo.yaml +147 -0
  59. mlperf_logging/compliance_checker/training_0.6.0/resnet.yaml +59 -0
  60. mlperf_logging/compliance_checker/training_0.6.0/score.yaml +20 -0
  61. mlperf_logging/compliance_checker/training_0.6.0/ssd.yaml +36 -0
  62. mlperf_logging/compliance_checker/training_0.6.0/transformer.yaml +41 -0
  63. mlperf_logging/compliance_checker/training_0.7.0/closed_bert.yaml +49 -0
  64. mlperf_logging/compliance_checker/training_0.7.0/closed_common.yaml +6 -0
  65. mlperf_logging/compliance_checker/training_0.7.0/closed_dlrm.yaml +36 -0
  66. mlperf_logging/compliance_checker/training_0.7.0/closed_gnmt.yaml +47 -0
  67. mlperf_logging/compliance_checker/training_0.7.0/closed_maskrcnn.yaml +57 -0
  68. mlperf_logging/compliance_checker/training_0.7.0/closed_minigo.yaml +43 -0
  69. mlperf_logging/compliance_checker/training_0.7.0/closed_resnet.yaml +18 -0
  70. mlperf_logging/compliance_checker/training_0.7.0/closed_resnet_lars.yaml +37 -0
  71. mlperf_logging/compliance_checker/training_0.7.0/closed_resnet_sgd.yaml +36 -0
  72. mlperf_logging/compliance_checker/training_0.7.0/closed_ssd.yaml +45 -0
  73. mlperf_logging/compliance_checker/training_0.7.0/closed_transformer.yaml +50 -0
  74. mlperf_logging/compliance_checker/training_0.7.0/common.yaml +154 -0
  75. mlperf_logging/compliance_checker/training_0.7.0/open_bert.yaml +7 -0
  76. mlperf_logging/compliance_checker/training_0.7.0/open_common.yaml +6 -0
  77. mlperf_logging/compliance_checker/training_0.7.0/open_dlrm.yaml +7 -0
  78. mlperf_logging/compliance_checker/training_0.7.0/open_gnmt.yaml +8 -0
  79. mlperf_logging/compliance_checker/training_0.7.0/open_maskrcnn.yaml +12 -0
  80. mlperf_logging/compliance_checker/training_0.7.0/open_minigo.yaml +9 -0
  81. mlperf_logging/compliance_checker/training_0.7.0/open_resnet.yaml +7 -0
  82. mlperf_logging/compliance_checker/training_0.7.0/open_ssd.yaml +7 -0
  83. mlperf_logging/compliance_checker/training_0.7.0/open_transformer.yaml +7 -0
  84. mlperf_logging/compliance_checker/training_0.7.0_warn/bert.yaml +50 -0
  85. mlperf_logging/compliance_checker/training_0.7.0_warn/common.yaml +210 -0
  86. mlperf_logging/compliance_checker/training_0.7.0_warn/dlrm.yaml +17 -0
  87. mlperf_logging/compliance_checker/training_0.7.0_warn/gnmt.yaml +52 -0
  88. mlperf_logging/compliance_checker/training_0.7.0_warn/maskrcnn.yaml +56 -0
  89. mlperf_logging/compliance_checker/training_0.7.0_warn/minigo.yaml +85 -0
  90. mlperf_logging/compliance_checker/training_0.7.0_warn/resnet.yaml +26 -0
  91. mlperf_logging/compliance_checker/training_0.7.0_warn/resnet_lars.yaml +27 -0
  92. mlperf_logging/compliance_checker/training_0.7.0_warn/resnet_sgd.yaml +6 -0
  93. mlperf_logging/compliance_checker/training_0.7.0_warn/ssd.yaml +41 -0
  94. mlperf_logging/compliance_checker/training_0.7.0_warn/transformer.yaml +44 -0
  95. mlperf_logging/compliance_checker/training_1.0.0/closed_bert.yaml +49 -0
  96. mlperf_logging/compliance_checker/training_1.0.0/closed_common.yaml +11 -0
  97. mlperf_logging/compliance_checker/training_1.0.0/closed_dlrm.yaml +36 -0
  98. mlperf_logging/compliance_checker/training_1.0.0/closed_maskrcnn.yaml +57 -0
  99. mlperf_logging/compliance_checker/training_1.0.0/closed_minigo.yaml +43 -0
  100. mlperf_logging/compliance_checker/training_1.0.0/closed_resnet.yaml +18 -0
  101. mlperf_logging/compliance_checker/training_1.0.0/closed_resnet_lars.yaml +37 -0
  102. mlperf_logging/compliance_checker/training_1.0.0/closed_resnet_sgd.yaml +36 -0
  103. mlperf_logging/compliance_checker/training_1.0.0/closed_rnnt.yaml +159 -0
  104. mlperf_logging/compliance_checker/training_1.0.0/closed_ssd.yaml +45 -0
  105. mlperf_logging/compliance_checker/training_1.0.0/closed_unet3d.yaml +73 -0
  106. mlperf_logging/compliance_checker/training_1.0.0/common.yaml +149 -0
  107. mlperf_logging/compliance_checker/training_1.0.0/open_bert.yaml +7 -0
  108. mlperf_logging/compliance_checker/training_1.0.0/open_common.yaml +7 -0
  109. mlperf_logging/compliance_checker/training_1.0.0/open_dlrm.yaml +7 -0
  110. mlperf_logging/compliance_checker/training_1.0.0/open_maskrcnn.yaml +12 -0
  111. mlperf_logging/compliance_checker/training_1.0.0/open_minigo.yaml +9 -0
  112. mlperf_logging/compliance_checker/training_1.0.0/open_resnet.yaml +7 -0
  113. mlperf_logging/compliance_checker/training_1.0.0/open_rnnt.yaml +7 -0
  114. mlperf_logging/compliance_checker/training_1.0.0/open_ssd.yaml +7 -0
  115. mlperf_logging/compliance_checker/training_1.0.0/open_unet3d.yaml +7 -0
  116. mlperf_logging/compliance_checker/training_1.1.0/closed_bert.yaml +463 -0
  117. mlperf_logging/compliance_checker/training_1.1.0/closed_common.yaml +11 -0
  118. mlperf_logging/compliance_checker/training_1.1.0/closed_dlrm.yaml +61 -0
  119. mlperf_logging/compliance_checker/training_1.1.0/closed_maskrcnn.yaml +94 -0
  120. mlperf_logging/compliance_checker/training_1.1.0/closed_minigo.yaml +108 -0
  121. mlperf_logging/compliance_checker/training_1.1.0/closed_resnet.yaml +195 -0
  122. mlperf_logging/compliance_checker/training_1.1.0/closed_resnet_lars.yaml +37 -0
  123. mlperf_logging/compliance_checker/training_1.1.0/closed_resnet_sgd.yaml +36 -0
  124. mlperf_logging/compliance_checker/training_1.1.0/closed_rnnt.yaml +159 -0
  125. mlperf_logging/compliance_checker/training_1.1.0/closed_ssd.yaml +181 -0
  126. mlperf_logging/compliance_checker/training_1.1.0/closed_unet3d.yaml +140 -0
  127. mlperf_logging/compliance_checker/training_1.1.0/common.yaml +149 -0
  128. mlperf_logging/compliance_checker/training_1.1.0/open_bert.yaml +7 -0
  129. mlperf_logging/compliance_checker/training_1.1.0/open_common.yaml +7 -0
  130. mlperf_logging/compliance_checker/training_1.1.0/open_dlrm.yaml +7 -0
  131. mlperf_logging/compliance_checker/training_1.1.0/open_maskrcnn.yaml +12 -0
  132. mlperf_logging/compliance_checker/training_1.1.0/open_minigo.yaml +9 -0
  133. mlperf_logging/compliance_checker/training_1.1.0/open_resnet.yaml +7 -0
  134. mlperf_logging/compliance_checker/training_1.1.0/open_rnnt.yaml +7 -0
  135. mlperf_logging/compliance_checker/training_1.1.0/open_ssd.yaml +7 -0
  136. mlperf_logging/compliance_checker/training_1.1.0/open_unet3d.yaml +7 -0
  137. mlperf_logging/compliance_checker/training_2.0.0/closed_bert.yaml +463 -0
  138. mlperf_logging/compliance_checker/training_2.0.0/closed_common.yaml +11 -0
  139. mlperf_logging/compliance_checker/training_2.0.0/closed_dlrm.yaml +61 -0
  140. mlperf_logging/compliance_checker/training_2.0.0/closed_maskrcnn.yaml +96 -0
  141. mlperf_logging/compliance_checker/training_2.0.0/closed_minigo.yaml +108 -0
  142. mlperf_logging/compliance_checker/training_2.0.0/closed_resnet.yaml +195 -0
  143. mlperf_logging/compliance_checker/training_2.0.0/closed_resnet_lars.yaml +37 -0
  144. mlperf_logging/compliance_checker/training_2.0.0/closed_resnet_sgd.yaml +36 -0
  145. mlperf_logging/compliance_checker/training_2.0.0/closed_rnnt.yaml +159 -0
  146. mlperf_logging/compliance_checker/training_2.0.0/closed_ssd.yaml +141 -0
  147. mlperf_logging/compliance_checker/training_2.0.0/closed_unet3d.yaml +140 -0
  148. mlperf_logging/compliance_checker/training_2.0.0/common.yaml +150 -0
  149. mlperf_logging/compliance_checker/training_2.0.0/open_bert.yaml +7 -0
  150. mlperf_logging/compliance_checker/training_2.0.0/open_common.yaml +7 -0
  151. mlperf_logging/compliance_checker/training_2.0.0/open_dlrm.yaml +7 -0
  152. mlperf_logging/compliance_checker/training_2.0.0/open_maskrcnn.yaml +12 -0
  153. mlperf_logging/compliance_checker/training_2.0.0/open_minigo.yaml +9 -0
  154. mlperf_logging/compliance_checker/training_2.0.0/open_resnet.yaml +7 -0
  155. mlperf_logging/compliance_checker/training_2.0.0/open_rnnt.yaml +7 -0
  156. mlperf_logging/compliance_checker/training_2.0.0/open_ssd.yaml +7 -0
  157. mlperf_logging/compliance_checker/training_2.0.0/open_unet3d.yaml +7 -0
  158. mlperf_logging/compliance_checker/training_2.1.0/closed_bert.yaml +463 -0
  159. mlperf_logging/compliance_checker/training_2.1.0/closed_common.yaml +11 -0
  160. mlperf_logging/compliance_checker/training_2.1.0/closed_dlrm.yaml +61 -0
  161. mlperf_logging/compliance_checker/training_2.1.0/closed_maskrcnn.yaml +96 -0
  162. mlperf_logging/compliance_checker/training_2.1.0/closed_minigo.yaml +108 -0
  163. mlperf_logging/compliance_checker/training_2.1.0/closed_resnet.yaml +195 -0
  164. mlperf_logging/compliance_checker/training_2.1.0/closed_resnet_lars.yaml +37 -0
  165. mlperf_logging/compliance_checker/training_2.1.0/closed_resnet_sgd.yaml +36 -0
  166. mlperf_logging/compliance_checker/training_2.1.0/closed_rnnt.yaml +159 -0
  167. mlperf_logging/compliance_checker/training_2.1.0/closed_ssd.yaml +141 -0
  168. mlperf_logging/compliance_checker/training_2.1.0/closed_unet3d.yaml +140 -0
  169. mlperf_logging/compliance_checker/training_2.1.0/common.yaml +150 -0
  170. mlperf_logging/compliance_checker/training_2.1.0/open_bert.yaml +7 -0
  171. mlperf_logging/compliance_checker/training_2.1.0/open_common.yaml +7 -0
  172. mlperf_logging/compliance_checker/training_2.1.0/open_dlrm.yaml +7 -0
  173. mlperf_logging/compliance_checker/training_2.1.0/open_maskrcnn.yaml +12 -0
  174. mlperf_logging/compliance_checker/training_2.1.0/open_minigo.yaml +9 -0
  175. mlperf_logging/compliance_checker/training_2.1.0/open_resnet.yaml +7 -0
  176. mlperf_logging/compliance_checker/training_2.1.0/open_rnnt.yaml +7 -0
  177. mlperf_logging/compliance_checker/training_2.1.0/open_ssd.yaml +7 -0
  178. mlperf_logging/compliance_checker/training_2.1.0/open_unet3d.yaml +7 -0
  179. mlperf_logging/compliance_checker/training_3.0.0/closed_bert.yaml +48 -0
  180. mlperf_logging/compliance_checker/training_3.0.0/closed_common.yaml +11 -0
  181. mlperf_logging/compliance_checker/training_3.0.0/closed_dlrm.yaml +35 -0
  182. mlperf_logging/compliance_checker/training_3.0.0/closed_dlrm_dcnv2.yaml +59 -0
  183. mlperf_logging/compliance_checker/training_3.0.0/closed_gpt3.yaml +79 -0
  184. mlperf_logging/compliance_checker/training_3.0.0/closed_maskrcnn.yaml +57 -0
  185. mlperf_logging/compliance_checker/training_3.0.0/closed_minigo.yaml +43 -0
  186. mlperf_logging/compliance_checker/training_3.0.0/closed_resnet.yaml +17 -0
  187. mlperf_logging/compliance_checker/training_3.0.0/closed_resnet_lars.yaml +37 -0
  188. mlperf_logging/compliance_checker/training_3.0.0/closed_resnet_sgd.yaml +36 -0
  189. mlperf_logging/compliance_checker/training_3.0.0/closed_rnnt.yaml +138 -0
  190. mlperf_logging/compliance_checker/training_3.0.0/closed_ssd.yaml +35 -0
  191. mlperf_logging/compliance_checker/training_3.0.0/closed_unet3d.yaml +73 -0
  192. mlperf_logging/compliance_checker/training_3.0.0/common.yaml +150 -0
  193. mlperf_logging/compliance_checker/training_3.0.0/open_bert.yaml +7 -0
  194. mlperf_logging/compliance_checker/training_3.0.0/open_common.yaml +7 -0
  195. mlperf_logging/compliance_checker/training_3.0.0/open_dlrm_dcnv2.yaml +7 -0
  196. mlperf_logging/compliance_checker/training_3.0.0/open_maskrcnn.yaml +12 -0
  197. mlperf_logging/compliance_checker/training_3.0.0/open_minigo.yaml +9 -0
  198. mlperf_logging/compliance_checker/training_3.0.0/open_resnet.yaml +7 -0
  199. mlperf_logging/compliance_checker/training_3.0.0/open_rnnt.yaml +7 -0
  200. mlperf_logging/compliance_checker/training_3.0.0/open_ssd.yaml +7 -0
  201. mlperf_logging/compliance_checker/training_3.0.0/open_unet3d.yaml +7 -0
  202. mlperf_logging/compliance_checker/training_3.1.0/closed_bert.yaml +48 -0
  203. mlperf_logging/compliance_checker/training_3.1.0/closed_common.yaml +11 -0
  204. mlperf_logging/compliance_checker/training_3.1.0/closed_dlrm.yaml +35 -0
  205. mlperf_logging/compliance_checker/training_3.1.0/closed_dlrm_dcnv2.yaml +59 -0
  206. mlperf_logging/compliance_checker/training_3.1.0/closed_gpt3.yaml +79 -0
  207. mlperf_logging/compliance_checker/training_3.1.0/closed_maskrcnn.yaml +57 -0
  208. mlperf_logging/compliance_checker/training_3.1.0/closed_minigo.yaml +43 -0
  209. mlperf_logging/compliance_checker/training_3.1.0/closed_resnet.yaml +17 -0
  210. mlperf_logging/compliance_checker/training_3.1.0/closed_resnet_lars.yaml +37 -0
  211. mlperf_logging/compliance_checker/training_3.1.0/closed_resnet_sgd.yaml +36 -0
  212. mlperf_logging/compliance_checker/training_3.1.0/closed_rnnt.yaml +138 -0
  213. mlperf_logging/compliance_checker/training_3.1.0/closed_ssd.yaml +35 -0
  214. mlperf_logging/compliance_checker/training_3.1.0/closed_stable_diffusion.yaml +74 -0
  215. mlperf_logging/compliance_checker/training_3.1.0/closed_unet3d.yaml +73 -0
  216. mlperf_logging/compliance_checker/training_3.1.0/common.yaml +151 -0
  217. mlperf_logging/compliance_checker/training_3.1.0/open_bert.yaml +7 -0
  218. mlperf_logging/compliance_checker/training_3.1.0/open_common.yaml +7 -0
  219. mlperf_logging/compliance_checker/training_3.1.0/open_dlrm_dcnv2.yaml +7 -0
  220. mlperf_logging/compliance_checker/training_3.1.0/open_maskrcnn.yaml +12 -0
  221. mlperf_logging/compliance_checker/training_3.1.0/open_minigo.yaml +9 -0
  222. mlperf_logging/compliance_checker/training_3.1.0/open_resnet.yaml +7 -0
  223. mlperf_logging/compliance_checker/training_3.1.0/open_rnnt.yaml +7 -0
  224. mlperf_logging/compliance_checker/training_3.1.0/open_ssd.yaml +7 -0
  225. mlperf_logging/compliance_checker/training_3.1.0/open_stable_diffusion.yaml +33 -0
  226. mlperf_logging/compliance_checker/training_3.1.0/open_unet3d.yaml +7 -0
  227. mlperf_logging/compliance_checker/training_4.0.0/closed_bert.yaml +48 -0
  228. mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml +11 -0
  229. mlperf_logging/compliance_checker/training_4.0.0/closed_dlrm_dcnv2.yaml +59 -0
  230. mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml +21 -0
  231. mlperf_logging/compliance_checker/training_4.0.0/closed_gpt3.yaml +79 -0
  232. mlperf_logging/compliance_checker/training_4.0.0/closed_llama2_70b_lora.yaml +45 -0
  233. mlperf_logging/compliance_checker/training_4.0.0/closed_resnet.yaml +17 -0
  234. mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_lars.yaml +37 -0
  235. mlperf_logging/compliance_checker/training_4.0.0/closed_resnet_sgd.yaml +36 -0
  236. mlperf_logging/compliance_checker/training_4.0.0/closed_ssd.yaml +35 -0
  237. mlperf_logging/compliance_checker/training_4.0.0/closed_stable_diffusion.yaml +74 -0
  238. mlperf_logging/compliance_checker/training_4.0.0/closed_unet3d.yaml +73 -0
  239. mlperf_logging/compliance_checker/training_4.0.0/common.yaml +151 -0
  240. mlperf_logging/compliance_checker/training_4.0.0/open_bert.yaml +7 -0
  241. mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml +6 -0
  242. mlperf_logging/compliance_checker/training_4.0.0/open_dlrm_dcnv2.yaml +7 -0
  243. mlperf_logging/compliance_checker/training_4.0.0/open_gnn.yaml +7 -0
  244. mlperf_logging/compliance_checker/training_4.0.0/open_gpt3.yaml +79 -0
  245. mlperf_logging/compliance_checker/training_4.0.0/open_llama2_70b_lora.yaml +7 -0
  246. mlperf_logging/compliance_checker/training_4.0.0/open_resnet.yaml +7 -0
  247. mlperf_logging/compliance_checker/training_4.0.0/open_ssd.yaml +7 -0
  248. mlperf_logging/compliance_checker/training_4.0.0/open_stable_diffusion.yaml +33 -0
  249. mlperf_logging/compliance_checker/training_4.0.0/open_unet3d.yaml +7 -0
  250. mlperf_logging/compliance_checker/training_4.1.0/closed_bert.yaml +48 -0
  251. mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml +11 -0
  252. mlperf_logging/compliance_checker/training_4.1.0/closed_dlrm_dcnv2.yaml +59 -0
  253. mlperf_logging/compliance_checker/training_4.1.0/closed_gnn.yaml +21 -0
  254. mlperf_logging/compliance_checker/training_4.1.0/closed_gpt3.yaml +79 -0
  255. mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml +42 -0
  256. mlperf_logging/compliance_checker/training_4.1.0/closed_ssd.yaml +35 -0
  257. mlperf_logging/compliance_checker/training_4.1.0/closed_stable_diffusion.yaml +74 -0
  258. mlperf_logging/compliance_checker/training_4.1.0/common.yaml +146 -0
  259. mlperf_logging/compliance_checker/training_4.1.0/open_bert.yaml +7 -0
  260. mlperf_logging/compliance_checker/training_4.1.0/open_common.yaml +6 -0
  261. mlperf_logging/compliance_checker/training_4.1.0/open_dlrm_dcnv2.yaml +7 -0
  262. mlperf_logging/compliance_checker/training_4.1.0/open_gnn.yaml +7 -0
  263. mlperf_logging/compliance_checker/training_4.1.0/open_gpt3.yaml +79 -0
  264. mlperf_logging/compliance_checker/training_4.1.0/open_llama2_70b_lora.yaml +7 -0
  265. mlperf_logging/compliance_checker/training_4.1.0/open_ssd.yaml +7 -0
  266. mlperf_logging/compliance_checker/training_4.1.0/open_stable_diffusion.yaml +33 -0
  267. mlperf_logging/mllog/__init__.py +97 -0
  268. mlperf_logging/mllog/constants.py +190 -0
  269. mlperf_logging/mllog/examples/__init__.py +14 -0
  270. mlperf_logging/mllog/examples/dummy_example.py +87 -0
  271. mlperf_logging/mllog/examples/linear_regression_example.py +311 -0
  272. mlperf_logging/mllog/examples/power/__init__.py +14 -0
  273. mlperf_logging/mllog/examples/power/compute_metric_example.py +33 -0
  274. mlperf_logging/mllog/examples/power/power_measurement.py +194 -0
  275. mlperf_logging/mllog/examples/power/reader.py +73 -0
  276. mlperf_logging/mllog/mllog.py +261 -0
  277. mlperf_logging/mllog/test_mllog.py +133 -0
  278. mlperf_logging/package_checker/__init__.py +0 -0
  279. mlperf_logging/package_checker/__main__.py +3 -0
  280. mlperf_logging/package_checker/package_checker.py +337 -0
  281. mlperf_logging/package_checker/seed_checker.py +148 -0
  282. mlperf_logging/rcp_checker/__init__.py +0 -0
  283. mlperf_logging/rcp_checker/__main__.py +3 -0
  284. mlperf_logging/rcp_checker/hpc_1.0.0/rcps_cosmoflow.json +66 -0
  285. mlperf_logging/rcp_checker/hpc_1.0.0/rcps_deepcam.json +117 -0
  286. mlperf_logging/rcp_checker/hpc_1.0.0/rcps_oc20.json +41 -0
  287. mlperf_logging/rcp_checker/hpc_2.0.0/rcps_cosmoflow.json +82 -0
  288. mlperf_logging/rcp_checker/hpc_2.0.0/rcps_deepcam.json +119 -0
  289. mlperf_logging/rcp_checker/hpc_2.0.0/rcps_oc20.json +41 -0
  290. mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json +82 -0
  291. mlperf_logging/rcp_checker/hpc_3.0.0/rcps_deepcam.json +119 -0
  292. mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json +41 -0
  293. mlperf_logging/rcp_checker/hpc_3.0.0/rcps_openfold.json +52 -0
  294. mlperf_logging/rcp_checker/rcp_checker.py +576 -0
  295. mlperf_logging/rcp_checker/training_1.0.0/rcps_bert.json +163 -0
  296. mlperf_logging/rcp_checker/training_1.0.0/rcps_dlrm.json +59 -0
  297. mlperf_logging/rcp_checker/training_1.0.0/rcps_maskrcnn.json +113 -0
  298. mlperf_logging/rcp_checker/training_1.0.0/rcps_resnet.json +192 -0
  299. mlperf_logging/rcp_checker/training_1.0.0/rcps_rnnt.json +75 -0
  300. mlperf_logging/rcp_checker/training_1.0.0/rcps_ssd.json +63 -0
  301. mlperf_logging/rcp_checker/training_1.0.0/rcps_unet3d.json +131 -0
  302. mlperf_logging/rcp_checker/training_1.1.0/rcps_bert.json +256 -0
  303. mlperf_logging/rcp_checker/training_1.1.0/rcps_dlrm.json +65 -0
  304. mlperf_logging/rcp_checker/training_1.1.0/rcps_maskrcnn.json +113 -0
  305. mlperf_logging/rcp_checker/training_1.1.0/rcps_resnet.json +197 -0
  306. mlperf_logging/rcp_checker/training_1.1.0/rcps_rnnt.json +115 -0
  307. mlperf_logging/rcp_checker/training_1.1.0/rcps_ssd.json +93 -0
  308. mlperf_logging/rcp_checker/training_1.1.0/rcps_unet3d.json +146 -0
  309. mlperf_logging/rcp_checker/training_2.0.0/rcps_bert.json +302 -0
  310. mlperf_logging/rcp_checker/training_2.0.0/rcps_dlrm.json +65 -0
  311. mlperf_logging/rcp_checker/training_2.0.0/rcps_maskrcnn.json +131 -0
  312. mlperf_logging/rcp_checker/training_2.0.0/rcps_resnet.json +221 -0
  313. mlperf_logging/rcp_checker/training_2.0.0/rcps_rnnt.json +168 -0
  314. mlperf_logging/rcp_checker/training_2.0.0/rcps_ssd.json +57 -0
  315. mlperf_logging/rcp_checker/training_2.0.0/rcps_unet3d.json +146 -0
  316. mlperf_logging/rcp_checker/training_2.1.0/rcps_bert.json +303 -0
  317. mlperf_logging/rcp_checker/training_2.1.0/rcps_dlrm.json +65 -0
  318. mlperf_logging/rcp_checker/training_2.1.0/rcps_maskrcnn.json +131 -0
  319. mlperf_logging/rcp_checker/training_2.1.0/rcps_resnet.json +221 -0
  320. mlperf_logging/rcp_checker/training_2.1.0/rcps_rnnt.json +168 -0
  321. mlperf_logging/rcp_checker/training_2.1.0/rcps_ssd.json +92 -0
  322. mlperf_logging/rcp_checker/training_2.1.0/rcps_unet3d.json +146 -0
  323. mlperf_logging/rcp_checker/training_3.0.0/rcps_bert.json +303 -0
  324. mlperf_logging/rcp_checker/training_3.0.0/rcps_dlrm_dcnv2.json +133 -0
  325. mlperf_logging/rcp_checker/training_3.0.0/rcps_gpt3.json +78 -0
  326. mlperf_logging/rcp_checker/training_3.0.0/rcps_maskrcnn.json +131 -0
  327. mlperf_logging/rcp_checker/training_3.0.0/rcps_resnet.json +221 -0
  328. mlperf_logging/rcp_checker/training_3.0.0/rcps_rnnt.json +168 -0
  329. mlperf_logging/rcp_checker/training_3.0.0/rcps_ssd.json +127 -0
  330. mlperf_logging/rcp_checker/training_3.0.0/rcps_unet3d.json +182 -0
  331. mlperf_logging/rcp_checker/training_3.1.0/rcps_bert.json +303 -0
  332. mlperf_logging/rcp_checker/training_3.1.0/rcps_dlrm_dcnv2.json +133 -0
  333. mlperf_logging/rcp_checker/training_3.1.0/rcps_gpt3.json +78 -0
  334. mlperf_logging/rcp_checker/training_3.1.0/rcps_maskrcnn.json +131 -0
  335. mlperf_logging/rcp_checker/training_3.1.0/rcps_resnet.json +221 -0
  336. mlperf_logging/rcp_checker/training_3.1.0/rcps_rnnt.json +168 -0
  337. mlperf_logging/rcp_checker/training_3.1.0/rcps_ssd.json +145 -0
  338. mlperf_logging/rcp_checker/training_3.1.0/rcps_stable_diffusion.json +66 -0
  339. mlperf_logging/rcp_checker/training_3.1.0/rcps_unet3d.json +178 -0
  340. mlperf_logging/rcp_checker/training_4.0.0/rcps_bert.json +303 -0
  341. mlperf_logging/rcp_checker/training_4.0.0/rcps_dlrm_dcnv2.json +133 -0
  342. mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json +90 -0
  343. mlperf_logging/rcp_checker/training_4.0.0/rcps_gpt3.json +78 -0
  344. mlperf_logging/rcp_checker/training_4.0.0/rcps_llama2_70b_lora.json +91 -0
  345. mlperf_logging/rcp_checker/training_4.0.0/rcps_resnet.json +221 -0
  346. mlperf_logging/rcp_checker/training_4.0.0/rcps_ssd.json +163 -0
  347. mlperf_logging/rcp_checker/training_4.0.0/rcps_stable_diffusion.json +88 -0
  348. mlperf_logging/rcp_checker/training_4.0.0/rcps_unet3d.json +132 -0
  349. mlperf_logging/rcp_checker/training_4.1.0/rcps_bert.json +303 -0
  350. mlperf_logging/rcp_checker/training_4.1.0/rcps_dlrm_dcnv2.json +133 -0
  351. mlperf_logging/rcp_checker/training_4.1.0/rcps_gnn.json +90 -0
  352. mlperf_logging/rcp_checker/training_4.1.0/rcps_gpt3.json +93 -0
  353. mlperf_logging/rcp_checker/training_4.1.0/rcps_llama2_70b_lora.json +91 -0
  354. mlperf_logging/rcp_checker/training_4.1.0/rcps_ssd.json +163 -0
  355. mlperf_logging/rcp_checker/training_4.1.0/rcps_stable_diffusion.json +88 -0
  356. mlperf_logging/repo_checker/__init__.py +0 -0
  357. mlperf_logging/repo_checker/__main__.py +3 -0
  358. mlperf_logging/repo_checker/repo_checker.py +155 -0
  359. mlperf_logging/result_summarizer/__init__.py +0 -0
  360. mlperf_logging/result_summarizer/__main__.py +3 -0
  361. mlperf_logging/result_summarizer/result_summarizer.py +977 -0
  362. mlperf_logging/system_desc_checker/__init__.py +0 -0
  363. mlperf_logging/system_desc_checker/__main__.py +3 -0
  364. mlperf_logging/system_desc_checker/system_desc_checker.py +179 -0
  365. mlperf_logging-3.0.0.dist-info/LICENSE.md +177 -0
  366. mlperf_logging-3.0.0.dist-info/METADATA +65 -0
  367. mlperf_logging-3.0.0.dist-info/RECORD +369 -0
  368. mlperf_logging-3.0.0.dist-info/WHEEL +5 -0
  369. mlperf_logging-3.0.0.dist-info/top_level.txt +1 -0
File without changes
@@ -0,0 +1,184 @@
1
# Result-file counts per benchmark, grouped by usage ('training' / 'hpc').
# Looked up by get_result_file_counts(usage).
_ALL_RESULT_FILE_COUNTS = dict(
    training=dict(
        bert=10,
        dlrm=5,
        dlrm_dcnv2=10,
        gnmt=10,
        gpt3=3,
        maskrcnn=5,
        minigo=10,
        resnet=5,
        ssd=5,
        stable_diffusion=10,
        transformer=10,
        ncf=10,
        rnnt=10,
        unet3d=40,
        gnn=10,
        llama2_70b_lora=10,
    ),
    hpc=dict(
        deepcam=5,
        cosmoflow=10,
        oc20=5,
        openfold=10,
    ),
)
29
+
30
+
31
# Benchmarks accepted for each usage ('training' / 'hpc') and ruleset.
# Rulesets are keyed by their short "major.minor" form; see
# get_allowed_benchmarks() for how longer version strings are matched.
# Each benchmark name appears at most once per ruleset.
_ALL_ALLOWED_BENCHMARKS = {
    'training': {
        '0.6': [
            'resnet',
            'ssd',
            'maskrcnn',
            'gnmt',
            'transformer',
            'ncf',
            'minigo',
        ],

        '0.7': [
            'bert',
            'dlrm',
            'gnmt',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'transformer',
        ],
        '1.0': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '1.1': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '2.0': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '2.1': [
            'bert',
            'dlrm',
            'maskrcnn',
            'minigo',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '3.0': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'maskrcnn',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
        ],
        '3.1': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'maskrcnn',
            'resnet',
            'ssd',
            'rnnt',
            'unet3d',
            'stable_diffusion',
        ],
        # 'stable_diffusion' used to be listed twice here; the duplicate
        # entry has been dropped.
        '4.0': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'resnet',
            'ssd',
            'unet3d',
            'stable_diffusion',
            'llama2_70b_lora',
            'gnn',
        ],
        '4.1': [
            'bert',
            'dlrm_dcnv2',
            'gpt3',
            'ssd',
            'stable_diffusion',
            'llama2_70b_lora',
            'gnn',
        ],
    },

    'hpc': {
        '0.7': [
            'cosmoflow',
            'deepcam',
        ],

        '1.0': [
            'cosmoflow',
            'deepcam',
            'oc20',
        ],
        '2.0': [
            'cosmoflow',
            'deepcam',
            'oc20',
        ],
        '3.0': [
            'cosmoflow',
            'deepcam',
            'oc20',
            'openfold',
        ],
    },
}
161
+
162
+
163
def get_allowed_benchmarks(usage, ruleset):
    """Return the list of benchmarks allowed for ``usage`` under ``ruleset``.

    The ruleset is looked up verbatim first. If that fails, its last
    dot-separated component is dropped (e.g. '4.1.0' -> '4.1') and the
    lookup is retried once.

    Raises:
        ValueError: if ``usage`` is unknown, or if neither the full nor the
            shortened ruleset is registered for that usage.
    """
    if usage not in _ALL_ALLOWED_BENCHMARKS:
        raise ValueError('usage {} not supported!'.format(usage))
    per_ruleset = _ALL_ALLOWED_BENCHMARKS[usage]

    # Exact match wins.
    if ruleset in per_ruleset:
        return per_ruleset[ruleset]

    # Fall back to the version string without its final component; this is
    # the empty string when ``ruleset`` contains no dot.
    trimmed = ruleset.rpartition(".")[0]
    if trimmed in per_ruleset:
        return per_ruleset[trimmed]

    raise ValueError('ruleset {} is not supported in {}'.format(ruleset, usage))
179
+
180
+
181
def get_result_file_counts(usage):
    """Return the per-benchmark result-file count mapping for ``usage``.

    Raises:
        ValueError: if ``usage`` has no entry in the counts table.
    """
    if usage in _ALL_RESULT_FILE_COUNTS:
        return _ALL_RESULT_FILE_COUNTS[usage]
    raise ValueError('usage {} not supported!'.format(usage))
File without changes
@@ -0,0 +1,32 @@
1
+ import sys
2
+ import logging
3
+
4
+ from . import mlp_compliance
5
+
6
# Command-line entry point for the compliance checker: parse arguments,
# configure logging, run the check and exit non-zero on failure.
parser = mlp_compliance.get_parser()
args = parser.parse_args()

# Root logger writes to args.log_output (via basicConfig) and to an extra
# StreamHandler; the first two root handlers share one formatter.
logging.basicConfig(filename=args.log_output, level=logging.INFO)
root_logger = logging.getLogger()
root_logger.addHandler(logging.StreamHandler())
formatter = logging.Formatter("%(levelname)s - %(message)s")
for handler in root_logger.handlers[:2]:
    handler.setFormatter(formatter)

# An explicit (truthy) --config wins; otherwise fall back to the
# per-usage/ruleset common.yaml.
config_file = args.config
if not config_file:
    config_file = f'{args.usage}_{args.ruleset}/common.yaml'

checker = mlp_compliance.make_checker(
    args.usage, args.ruleset, args.quiet, args.werror)

valid, system_id, benchmark, result = mlp_compliance.main(
    args.filename, config_file, checker)

if valid:
    print('** Logging output also at', args.log_output)
    logging.info('SUCCESS')
else:
    logging.error('FAILED')
    print('** Logging output also at', args.log_output)
    sys.exit(1)
@@ -0,0 +1,24 @@
1
+
2
+ - KEY:
3
+ NAME: submission_benchmark
4
+ REQ: EXACTLY_ONE
5
+ CHECK: " v['value'] in ['deepcam', 'cosmoflow', 'oc20'] "
6
+ POST: " enqueue_config('hpc_1.0.0/closed_{}.yaml'.format(v['value'])) "
7
+
8
+ - KEY:
9
+ NAME: gradient_accumulation_steps
10
+ CHECK: " v['value'] > 0 "
11
+
12
+ - KEY:
13
+ NAME: gradient_accumulation_frequency
14
+ CHECK: " v['value'] > 0 "
15
+
16
+ - KEY:
17
+ NAME: number_of_nodes
18
+ REQ: EXACTLY_ONE
19
+ CHECK: " v['value'] > 0"
20
+
21
+ - KEY:
22
+ NAME: accelerators_per_node
23
+ REQ: EXACTLY_ONE
24
+ CHECK: " v['value'] >= 0"
@@ -0,0 +1,47 @@
1
+ - KEY:
2
+ NAME: global_batch_size
3
+ REQ: EXACTLY_ONE
4
+ CHECK: " v['value'] > 0"
5
+
6
+ - KEY:
7
+ NAME: opt_name
8
+ REQ: EXACTLY_ONE
9
+ CHECK: " v['value'] in ['sgd', 'SGD'] "
10
+
11
+ - KEY:
12
+ NAME: opt_base_learning_rate
13
+ REQ: EXACTLY_ONE
14
+ CHECK: " v['value'] >= 0."
15
+
16
+ - KEY:
17
+ NAME: opt_learning_rate_warmup_epochs
18
+ REQ: EXACTLY_ONE
19
+ CHECK: " v['value'] >= 0"
20
+
21
+ - KEY:
22
+ NAME: opt_learning_rate_warmup_factor
23
+ REQ: EXACTLY_ONE
24
+ CHECK: " v['value'] >= 0."
25
+
26
+ - KEY:
27
+ NAME: opt_learning_rate_decay_boundary_epochs
28
+ REQ: EXACTLY_ONE
29
+
30
+ - KEY:
31
+ NAME: opt_learning_rate_decay_factor
32
+ REQ: EXACTLY_ONE
33
+
34
+ - KEY:
35
+ NAME: dropout
36
+ CHECK: " v['value'] >= 0. and v['value'] < 1."
37
+
38
+ - KEY:
39
+ NAME: opt_weight_decay
40
+ CHECK: " v['value'] >= 0."
41
+
42
+ - KEY:
43
+ NAME: eval_error
44
+ REQ: AT_LEAST_ONE
45
+ CHECK:
46
+ - "'epoch_num' in v['metadata']"
47
+ ATLEAST_ONE_CHECK: "v['value'] <= 0.124 and v['value'] > 0."
@@ -0,0 +1,80 @@
1
+ # General Settings
2
+ - KEY:
3
+ NAME: gradient_accumulation_frequency
4
+ REQ: EXACTLY_ONE
5
+ CHECK: " v['value'] > 0 "
6
+
7
+ - KEY:
8
+ NAME: seed
9
+ REQ: EXACTLY_ONE
10
+ CHECK: " v['value'] > 0"
11
+
12
+ - KEY:
13
+ NAME: global_batch_size
14
+ REQ: EXACTLY_ONE
15
+ CHECK: " v['value'] > 0"
16
+
17
+ - KEY:
18
+ NAME: batchnorm_group_size
19
+ REQ: EXACTLY_ONE
20
+ CHECK: " v['value'] > 0"
21
+
22
+
23
+ # Optimizer Parameters
24
+ - KEY:
25
+ NAME: opt_name
26
+ REQ: EXACTLY_ONE
27
+ CHECK: " v['value'] in ['Adam', 'AdamW', 'LAMB']"
28
+ POST: " if (v['value'] == 'LAMB'): enqueue_config('hpc_1.0.0/closed_deepcam_lamb.yaml') "
29
+
30
+ - KEY:
31
+ NAME: opt_lr
32
+ REQ: EXACTLY_ONE
33
+ CHECK: " v['value'] >0."
34
+
35
+ - KEY:
36
+ NAME: opt_betas
37
+ REQ: EXACTLY_ONE
38
+ CHECK: " len(v['value']) == 2"
39
+
40
+ - KEY:
41
+ NAME: opt_eps
42
+ REQ: EXACTLY_ONE
43
+ CHECK: " math.isclose(v['value'], 1e-6)"
44
+
45
+
46
+ # Scheduler Parameters
47
+ - KEY:
48
+ NAME: scheduler_type
49
+ REQ: EXACTLY_ONE
50
+ CHECK: " v['value'] in ['multistep', 'cosine_annealing']"
51
+ POST: " enqueue_config('hpc_1.0.0/closed_deepcam_{}.yaml'.format(v['value'].lower())) "
52
+
53
+ - KEY:
54
+ NAME: scheduler_lr_warmup_steps
55
+ REQ: EXACTLY_ONE
56
+ CHECK: " v['value'] >= 0 "
57
+
58
+ - KEY:
59
+ NAME: scheduler_lr_warmup_factor
60
+ REQ: EXACTLY_ONE
61
+ CHECK: " v['value'] >= 1. "
62
+
63
+ # Dataset Properties
64
+ - KEY:
65
+ NAME: train_samples
66
+ REQ: EXACTLY_ONE
67
+ CHECK: " v['value'] == 121266"
68
+
69
+ - KEY:
70
+ NAME: eval_samples
71
+ REQ: EXACTLY_ONE
72
+ CHECK: " v['value'] == 15158"
73
+
74
+ # Convergence Properties
75
+ - KEY:
76
+ NAME: eval_accuracy
77
+ REQ: AT_LEAST_ONE
78
+ CHECK:
79
+ - "'epoch_num' in v['metadata']"
80
+ ATLEAST_ONE_CHECK: "v['value'] >= 0.82 and v['value'] <= 1."
@@ -0,0 +1,10 @@
1
+ # Scheduler Parameters
2
+ - KEY:
3
+ NAME: scheduler_t_max
4
+ REQ: EXACTLY_ONE
5
+ CHECK: " v['value'] >= 1. "
6
+
7
+ - KEY:
8
+ NAME: scheduler_eta_min
9
+ REQ: EXACTLY_ONE
10
+ CHECK: " v['value'] >= 0. "
@@ -0,0 +1,16 @@
1
+ # Optimizer Parameters
2
+ - KEY:
3
+ NAME: opt_bias_correction
4
+ REQ: EXACTLY_ONE
5
+ CHECK: " v['value'] "
6
+
7
+ - KEY:
8
+ NAME: opt_grad_averaging
9
+ REQ: EXACTLY_ONE
10
+ CHECK: " v['value'] "
11
+
12
+ - KEY:
13
+ NAME: opt_max_grad_norm
14
+ REQ: EXACTLY_ONE
15
+ CHECK: " v['value'] == 1."
16
+
@@ -0,0 +1,10 @@
1
+ # Scheduler Parameters
2
+ - KEY:
3
+ NAME: scheduler_milestones
4
+ REQ: EXACTLY_ONE
5
+ CHECK: " len(v['value']) >= 0 "
6
+
7
+ - KEY:
8
+ NAME: scheduler_decay_rate
9
+ REQ: EXACTLY_ONE
10
+ CHECK: " v['value'] <= 1. "
@@ -0,0 +1,39 @@
1
+ - KEY:
2
+ NAME: global_batch_size
3
+ REQ: EXACTLY_ONE
4
+ CHECK: " v['value'] > 0"
5
+
6
+ - KEY:
7
+ NAME: opt_name
8
+ REQ: EXACTLY_ONE
9
+ CHECK: " v['value'] == 'AdamW'"
10
+
11
+ - KEY:
12
+ NAME: opt_base_learning_rate
13
+ REQ: EXACTLY_ONE
14
+ CHECK: " v['value'] >= 0."
15
+
16
+ - KEY:
17
+ NAME: opt_learning_rate_warmup_steps
18
+ REQ: EXACTLY_ONE
19
+ CHECK: " v['value'] >= 0"
20
+
21
+ - KEY:
22
+ NAME: opt_learning_rate_warmup_factor
23
+ REQ: EXACTLY_ONE
24
+ CHECK: " v['value'] >= 0."
25
+
26
+ - KEY:
27
+ NAME: opt_learning_rate_decay_boundary_steps
28
+ REQ: EXACTLY_ONE
29
+
30
+ - KEY:
31
+ NAME: opt_learning_rate_decay_factor
32
+ REQ: EXACTLY_ONE
33
+
34
+ - KEY:
35
+ NAME: eval_error
36
+ REQ: AT_LEAST_ONE
37
+ CHECK:
38
+ - "'epoch_num' in v['metadata']"
39
+ ATLEAST_ONE_CHECK: "v['value'] <= 0.036 and v['value'] > 0."
@@ -0,0 +1,156 @@
1
+ # This file lists all the KEYs to be checked. Every line that matches mlperf logging regex (::MLL...) will be checked against these rules.
2
+ # In the order of appearance in the log, the code specified under CHECK for the KEY in each line will be executed.
3
+ # The code will be launched using local state 'v' which is the content of value field in log line, and global state 's'.
4
+ # Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's' use BEGIN record which CODE will
5
+ # be executed before any checks.
6
+ # In addition, occurrence of each key will be counted and at the end if a requirement regarding the number of occurrences is defined it will
7
+ # be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported.
8
+ #
9
+ # KEY record:
10
+ # NAME
11
+ # REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE, AT_LEAST_ONE_OR(<other key>)}
12
+ # PRE - optional - code to be executed before CHECK
13
+ # CHECK - optional - expression to be evaluated to verify correctness
14
+ # POST - optional - code to be executed after CHECK
15
+
16
+ - BEGIN:
17
+ CODE: >
18
+ s.update({
19
+ 'init_started': False,
20
+ 'init_stopped' : False,
21
+ 'run_started' : False,
22
+ 'run_stopped' : False,
23
+ 'in_epoch' : False,
24
+ 'last_epoch' : 0,
25
+ 'in_block' : False,
26
+ 'block_first_epoch' : -1,
27
+ 'first_init_start': 9e99,
28
+ })
29
+
30
+ - KEY:
31
+ NAME: submission_org
32
+ REQ: EXACTLY_ONE
33
+ CHECK: " v['value'] != '' "
34
+
35
+ - KEY:
36
+ NAME: submission_platform
37
+ REQ: EXACTLY_ONE
38
+ CHECK: " v['value'] != '' "
39
+
40
+ - KEY:
41
+ NAME: submission_division
42
+ REQ: EXACTLY_ONE
43
+ CHECK: " v['value'] in ['closed', 'open'] "
44
+ POST: " enqueue_config('hpc_1.0.0/{}_common.yaml'.format(v['value'])) "
45
+
46
+ - KEY:
47
+ NAME: submission_status
48
+ REQ: EXACTLY_ONE
49
+ CHECK: " v['value'] in ['cloud', 'onprem', 'preview', 'research'] "
50
+
51
+ # at least one record should be found, but any found records must pass the test
52
+ - KEY:
53
+ NAME: cache_clear
54
+ REQ: AT_LEAST_ONE
55
+ CHECK:
56
+ - "'value' in v"
57
+
58
+ # frequency not checked
59
+ - KEY:
60
+ NAME: init_start
61
+ REQ: AT_LEAST_ONE
62
+ CHECK:
63
+ - "not s['init_stopped']"
64
+ - "not s['run_started']"
65
+ POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) "
66
+
67
+ # confirm less than 30min since the very first init_start
68
+ - KEY:
69
+ NAME: init_stop
70
+ REQ: EXACTLY_ONE
71
+ CHECK:
72
+ - "s['init_started']"
73
+ - "not s['run_started']"
74
+ - "ll.timestamp - s['first_init_start'] < (30*60*1e3)"
75
+ POST: " s['init_stopped'] = True"
76
+
77
+ # HPC requires data staging to be included in run timing
78
+ - KEY:
79
+ NAME: staging_start
80
+ CHECK:
81
+ - "s['run_started']"
82
+ POST: " s['staging_started'] = True "
83
+
84
+ - KEY:
85
+ NAME: staging_stop
86
+ CHECK:
87
+ - "s['staging_started']"
88
+
89
+ # run start and run stop
90
+ - KEY:
91
+ NAME: run_start
92
+ REQ: EXACTLY_ONE
93
+ CHECK: " ( s['init_stopped'] == True )"
94
+ POST: " s['run_started'] = True "
95
+
96
+ # status can also be aborted, but not allowing it here for now
97
+ # if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok
98
+ - KEY:
99
+ NAME: run_stop
100
+ REQ: EXACTLY_ONE
101
+ CHECK:
102
+ - "s['run_started']"
103
+ - "'status' in v['metadata']"
104
+ POST: " s['run_stopped'] = True "
105
+
106
+ # FIXME: check epoch_count value match
107
+ - KEY:
108
+ NAME: block_start
109
+ REQ: AT_LEAST_ONE_OR(epoch_start)
110
+ CHECK:
111
+ - "s['run_started']"
112
+ - "'epoch_count' in v['metadata']"
113
+ - "'first_epoch_num' in v['metadata']"
114
+ - "v['metadata']['epoch_count'] > 0"
115
+
116
+ - KEY:
117
+ NAME: block_stop
118
+ REQ: AT_LEAST_ONE_OR(epoch_stop)
119
+ CHECK:
120
+ - "'first_epoch_num' in v['metadata']"
121
+
122
+ - KEY:
123
+ NAME: epoch_start
124
+ REQ: AT_LEAST_ONE_OR(block_start)
125
+ CHECK:
126
+ - "'epoch_num' in v['metadata']"
127
+
128
+ - KEY:
129
+ NAME: epoch_stop
130
+ REQ: AT_LEAST_ONE_OR(block_stop)
131
+ CHECK:
132
+ - "'epoch_num' in v['metadata']"
133
+
134
+ # making sure previous eval did print its accuracy result
135
+ - KEY:
136
+ NAME: eval_start
137
+ REQ: AT_LEAST_ONE_OR(block_start)
138
+ CHECK:
139
+ - "'epoch_num' in v['metadata']"
140
+
141
+ - KEY:
142
+ NAME: eval_stop
143
+ REQ: AT_LEAST_ONE_OR(block_stop)
144
+ CHECK:
145
+ - "'epoch_num' in v['metadata']"
146
+
147
+ - KEY:
148
+ NAME: train_samples
149
+ REQ: EXACTLY_ONE
150
+ CHECK: " v['value'] != '' "
151
+
152
+ - KEY:
153
+ NAME: eval_samples
154
+ REQ: EXACTLY_ONE
155
+ CHECK: " v['value'] != '' "
156
+
@@ -0,0 +1,5 @@
1
+ - KEY:
2
+ NAME: submission_benchmark
3
+ REQ: EXACTLY_ONE
4
+ CHECK: " v['value'] in ['deepcam', 'cosmoflow', 'oc20'] "
5
+ POST: " enqueue_config('hpc_1.0.0/open_{}.yaml'.format(v['value'])) "
@@ -0,0 +1,6 @@
1
+ - KEY:
2
+ NAME: eval_error
3
+ REQ: AT_LEAST_ONE
4
+ CHECK:
5
+ - "'epoch_num' in v['metadata']"
6
+ ATLEAST_ONE_CHECK: "v['value'] <= 0.124 and v['value'] > 0."
@@ -0,0 +1,6 @@
1
+ - KEY:
2
+ NAME: eval_accuracy
3
+ REQ: AT_LEAST_ONE
4
+ CHECK:
5
+ - "'epoch_num' in v['metadata']"
6
+ ATLEAST_ONE_CHECK: "v['value'] >= 0.82 and v['value'] <= 1."
@@ -0,0 +1,6 @@
1
+ - KEY:
2
+ NAME: eval_error
3
+ REQ: AT_LEAST_ONE
4
+ CHECK:
5
+ - "'epoch_num' in v['metadata']"
6
+ ATLEAST_ONE_CHECK: "v['value'] <= 0.036 and v['value'] > 0."
@@ -0,0 +1,24 @@
1
+
2
+ - KEY:
3
+ NAME: submission_benchmark
4
+ REQ: EXACTLY_ONE
5
+ CHECK: " v['value'] in ['deepcam', 'cosmoflow', 'oc20'] "
6
+ POST: " enqueue_config('hpc_2.0.0/closed_{}.yaml'.format(v['value'])) "
7
+
8
+ - KEY:
9
+ NAME: gradient_accumulation_steps
10
+ CHECK: " v['value'] > 0 "
11
+
12
+ - KEY:
13
+ NAME: gradient_accumulation_frequency
14
+ CHECK: " v['value'] > 0 "
15
+
16
+ - KEY:
17
+ NAME: number_of_nodes
18
+ REQ: EXACTLY_ONE
19
+ CHECK: " v['value'] > 0"
20
+
21
+ - KEY:
22
+ NAME: accelerators_per_node
23
+ REQ: EXACTLY_ONE
24
+ CHECK: " v['value'] >= 0"