wandb 0.22.1__py3-none-win_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (919):
  1. package_readme.md +97 -0
  2. wandb/__init__.py +248 -0
  3. wandb/__init__.pyi +1230 -0
  4. wandb/__main__.py +3 -0
  5. wandb/_analytics.py +65 -0
  6. wandb/_iterutils.py +73 -0
  7. wandb/_pydantic/__init__.py +30 -0
  8. wandb/_pydantic/base.py +108 -0
  9. wandb/_pydantic/field_types.py +29 -0
  10. wandb/_pydantic/utils.py +80 -0
  11. wandb/_pydantic/v1_compat.py +301 -0
  12. wandb/_strutils.py +40 -0
  13. wandb/agents/__init__.py +0 -0
  14. wandb/agents/pyagent.py +386 -0
  15. wandb/analytics/__init__.py +3 -0
  16. wandb/analytics/sentry.py +267 -0
  17. wandb/apis/__init__.py +50 -0
  18. wandb/apis/attrs.py +52 -0
  19. wandb/apis/importers/__init__.py +1 -0
  20. wandb/apis/importers/internals/internal.py +375 -0
  21. wandb/apis/importers/internals/protocols.py +103 -0
  22. wandb/apis/importers/internals/util.py +78 -0
  23. wandb/apis/importers/mlflow.py +254 -0
  24. wandb/apis/importers/validation.py +108 -0
  25. wandb/apis/importers/wandb.py +1608 -0
  26. wandb/apis/internal.py +241 -0
  27. wandb/apis/normalize.py +83 -0
  28. wandb/apis/paginator.py +138 -0
  29. wandb/apis/public/__init__.py +78 -0
  30. wandb/apis/public/api.py +2513 -0
  31. wandb/apis/public/artifacts.py +1050 -0
  32. wandb/apis/public/automations.py +86 -0
  33. wandb/apis/public/const.py +6 -0
  34. wandb/apis/public/files.py +411 -0
  35. wandb/apis/public/history.py +203 -0
  36. wandb/apis/public/integrations.py +203 -0
  37. wandb/apis/public/jobs.py +744 -0
  38. wandb/apis/public/projects.py +278 -0
  39. wandb/apis/public/query_generator.py +179 -0
  40. wandb/apis/public/registries/__init__.py +7 -0
  41. wandb/apis/public/registries/_freezable_list.py +176 -0
  42. wandb/apis/public/registries/_utils.py +139 -0
  43. wandb/apis/public/registries/registries_search.py +353 -0
  44. wandb/apis/public/registries/registry.py +370 -0
  45. wandb/apis/public/reports.py +597 -0
  46. wandb/apis/public/runs.py +1438 -0
  47. wandb/apis/public/sweeps.py +441 -0
  48. wandb/apis/public/teams.py +237 -0
  49. wandb/apis/public/users.py +179 -0
  50. wandb/apis/public/utils.py +211 -0
  51. wandb/apis/reports/__init__.py +1 -0
  52. wandb/apis/reports/v1/__init__.py +8 -0
  53. wandb/apis/reports/v2/__init__.py +8 -0
  54. wandb/apis/workspaces/__init__.py +8 -0
  55. wandb/automations/__init__.py +73 -0
  56. wandb/automations/_filters/__init__.py +40 -0
  57. wandb/automations/_filters/expressions.py +182 -0
  58. wandb/automations/_filters/operators.py +259 -0
  59. wandb/automations/_filters/run_metrics.py +330 -0
  60. wandb/automations/_generated/__init__.py +104 -0
  61. wandb/automations/_generated/create_automation.py +17 -0
  62. wandb/automations/_generated/create_generic_webhook_integration.py +37 -0
  63. wandb/automations/_generated/delete_automation.py +15 -0
  64. wandb/automations/_generated/enums.py +35 -0
  65. wandb/automations/_generated/fragments.py +293 -0
  66. wandb/automations/_generated/generic_webhook_integrations_by_entity.py +22 -0
  67. wandb/automations/_generated/get_automations.py +24 -0
  68. wandb/automations/_generated/get_automations_by_entity.py +26 -0
  69. wandb/automations/_generated/input_types.py +104 -0
  70. wandb/automations/_generated/integrations_by_entity.py +22 -0
  71. wandb/automations/_generated/operations.py +647 -0
  72. wandb/automations/_generated/slack_integrations_by_entity.py +22 -0
  73. wandb/automations/_generated/update_automation.py +17 -0
  74. wandb/automations/_utils.py +235 -0
  75. wandb/automations/_validators.py +185 -0
  76. wandb/automations/actions.py +220 -0
  77. wandb/automations/automations.py +85 -0
  78. wandb/automations/events.py +284 -0
  79. wandb/automations/integrations.py +45 -0
  80. wandb/automations/scopes.py +78 -0
  81. wandb/beta/workflows.py +324 -0
  82. wandb/bin/gpu_stats.exe +0 -0
  83. wandb/bin/wandb-core +0 -0
  84. wandb/cli/__init__.py +0 -0
  85. wandb/cli/beta.py +93 -0
  86. wandb/cli/beta_sync.py +224 -0
  87. wandb/cli/cli.py +2883 -0
  88. wandb/data_types.py +66 -0
  89. wandb/docker/__init__.py +290 -0
  90. wandb/docker/names.py +40 -0
  91. wandb/docker/wandb-entrypoint.sh +33 -0
  92. wandb/env.py +535 -0
  93. wandb/errors/__init__.py +17 -0
  94. wandb/errors/errors.py +40 -0
  95. wandb/errors/links.py +73 -0
  96. wandb/errors/term.py +415 -0
  97. wandb/errors/util.py +57 -0
  98. wandb/errors/warnings.py +2 -0
  99. wandb/filesync/__init__.py +0 -0
  100. wandb/filesync/dir_watcher.py +404 -0
  101. wandb/filesync/stats.py +100 -0
  102. wandb/filesync/step_checksum.py +142 -0
  103. wandb/filesync/step_prepare.py +179 -0
  104. wandb/filesync/step_upload.py +287 -0
  105. wandb/filesync/upload_job.py +142 -0
  106. wandb/integration/__init__.py +0 -0
  107. wandb/integration/catboost/__init__.py +5 -0
  108. wandb/integration/catboost/catboost.py +182 -0
  109. wandb/integration/cohere/__init__.py +3 -0
  110. wandb/integration/cohere/cohere.py +21 -0
  111. wandb/integration/cohere/resolver.py +347 -0
  112. wandb/integration/diffusers/__init__.py +3 -0
  113. wandb/integration/diffusers/autologger.py +76 -0
  114. wandb/integration/diffusers/pipeline_resolver.py +50 -0
  115. wandb/integration/diffusers/resolvers/__init__.py +9 -0
  116. wandb/integration/diffusers/resolvers/multimodal.py +881 -0
  117. wandb/integration/diffusers/resolvers/utils.py +102 -0
  118. wandb/integration/dspy/__init__.py +5 -0
  119. wandb/integration/dspy/dspy.py +422 -0
  120. wandb/integration/fastai/__init__.py +243 -0
  121. wandb/integration/gym/__init__.py +98 -0
  122. wandb/integration/huggingface/__init__.py +3 -0
  123. wandb/integration/huggingface/huggingface.py +18 -0
  124. wandb/integration/huggingface/resolver.py +213 -0
  125. wandb/integration/keras/__init__.py +11 -0
  126. wandb/integration/keras/callbacks/__init__.py +5 -0
  127. wandb/integration/keras/callbacks/metrics_logger.py +129 -0
  128. wandb/integration/keras/callbacks/model_checkpoint.py +188 -0
  129. wandb/integration/keras/callbacks/tables_builder.py +228 -0
  130. wandb/integration/keras/keras.py +1086 -0
  131. wandb/integration/kfp/__init__.py +6 -0
  132. wandb/integration/kfp/helpers.py +28 -0
  133. wandb/integration/kfp/kfp_patch.py +335 -0
  134. wandb/integration/kfp/wandb_logging.py +182 -0
  135. wandb/integration/langchain/__init__.py +3 -0
  136. wandb/integration/langchain/wandb_tracer.py +49 -0
  137. wandb/integration/lightgbm/__init__.py +239 -0
  138. wandb/integration/lightning/__init__.py +0 -0
  139. wandb/integration/lightning/fabric/__init__.py +3 -0
  140. wandb/integration/lightning/fabric/logger.py +763 -0
  141. wandb/integration/metaflow/__init__.py +9 -0
  142. wandb/integration/metaflow/data_pandas.py +74 -0
  143. wandb/integration/metaflow/data_pytorch.py +75 -0
  144. wandb/integration/metaflow/data_sklearn.py +76 -0
  145. wandb/integration/metaflow/errors.py +13 -0
  146. wandb/integration/metaflow/metaflow.py +327 -0
  147. wandb/integration/openai/__init__.py +3 -0
  148. wandb/integration/openai/fine_tuning.py +480 -0
  149. wandb/integration/openai/openai.py +22 -0
  150. wandb/integration/openai/resolver.py +240 -0
  151. wandb/integration/prodigy/__init__.py +3 -0
  152. wandb/integration/prodigy/prodigy.py +291 -0
  153. wandb/integration/sacred/__init__.py +117 -0
  154. wandb/integration/sagemaker/__init__.py +14 -0
  155. wandb/integration/sagemaker/auth.py +29 -0
  156. wandb/integration/sagemaker/config.py +58 -0
  157. wandb/integration/sagemaker/files.py +2 -0
  158. wandb/integration/sagemaker/resources.py +63 -0
  159. wandb/integration/sb3/__init__.py +3 -0
  160. wandb/integration/sb3/sb3.py +147 -0
  161. wandb/integration/sklearn/__init__.py +37 -0
  162. wandb/integration/sklearn/calculate/__init__.py +32 -0
  163. wandb/integration/sklearn/calculate/calibration_curves.py +125 -0
  164. wandb/integration/sklearn/calculate/class_proportions.py +68 -0
  165. wandb/integration/sklearn/calculate/confusion_matrix.py +93 -0
  166. wandb/integration/sklearn/calculate/decision_boundaries.py +40 -0
  167. wandb/integration/sklearn/calculate/elbow_curve.py +55 -0
  168. wandb/integration/sklearn/calculate/feature_importances.py +67 -0
  169. wandb/integration/sklearn/calculate/learning_curve.py +64 -0
  170. wandb/integration/sklearn/calculate/outlier_candidates.py +69 -0
  171. wandb/integration/sklearn/calculate/residuals.py +86 -0
  172. wandb/integration/sklearn/calculate/silhouette.py +118 -0
  173. wandb/integration/sklearn/calculate/summary_metrics.py +62 -0
  174. wandb/integration/sklearn/plot/__init__.py +35 -0
  175. wandb/integration/sklearn/plot/classifier.py +329 -0
  176. wandb/integration/sklearn/plot/clusterer.py +146 -0
  177. wandb/integration/sklearn/plot/regressor.py +121 -0
  178. wandb/integration/sklearn/plot/shared.py +91 -0
  179. wandb/integration/sklearn/utils.py +184 -0
  180. wandb/integration/tensorboard/__init__.py +10 -0
  181. wandb/integration/tensorboard/log.py +351 -0
  182. wandb/integration/tensorboard/monkeypatch.py +186 -0
  183. wandb/integration/tensorflow/__init__.py +5 -0
  184. wandb/integration/tensorflow/estimator_hook.py +54 -0
  185. wandb/integration/torch/__init__.py +0 -0
  186. wandb/integration/torch/wandb_torch.py +554 -0
  187. wandb/integration/ultralytics/__init__.py +11 -0
  188. wandb/integration/ultralytics/bbox_utils.py +215 -0
  189. wandb/integration/ultralytics/callback.py +528 -0
  190. wandb/integration/ultralytics/classification_utils.py +83 -0
  191. wandb/integration/ultralytics/mask_utils.py +202 -0
  192. wandb/integration/ultralytics/pose_utils.py +103 -0
  193. wandb/integration/weave/__init__.py +6 -0
  194. wandb/integration/weave/interface.py +49 -0
  195. wandb/integration/weave/weave.py +118 -0
  196. wandb/integration/xgboost/__init__.py +11 -0
  197. wandb/integration/xgboost/xgboost.py +189 -0
  198. wandb/integration/yolov8/__init__.py +0 -0
  199. wandb/integration/yolov8/yolov8.py +284 -0
  200. wandb/jupyter.py +538 -0
  201. wandb/mpmain/__init__.py +0 -0
  202. wandb/mpmain/__main__.py +1 -0
  203. wandb/old/__init__.py +0 -0
  204. wandb/old/core.py +53 -0
  205. wandb/old/settings.py +176 -0
  206. wandb/old/summary.py +438 -0
  207. wandb/plot/__init__.py +30 -0
  208. wandb/plot/bar.py +71 -0
  209. wandb/plot/confusion_matrix.py +185 -0
  210. wandb/plot/custom_chart.py +147 -0
  211. wandb/plot/histogram.py +66 -0
  212. wandb/plot/line.py +75 -0
  213. wandb/plot/line_series.py +173 -0
  214. wandb/plot/pr_curve.py +186 -0
  215. wandb/plot/roc_curve.py +163 -0
  216. wandb/plot/scatter.py +66 -0
  217. wandb/plot/utils.py +184 -0
  218. wandb/plot/viz.py +41 -0
  219. wandb/proto/__init__.py +0 -0
  220. wandb/proto/v3/__init__.py +0 -0
  221. wandb/proto/v3/wandb_base_pb2.py +55 -0
  222. wandb/proto/v3/wandb_internal_pb2.py +1738 -0
  223. wandb/proto/v3/wandb_server_pb2.py +209 -0
  224. wandb/proto/v3/wandb_settings_pb2.py +122 -0
  225. wandb/proto/v3/wandb_sync_pb2.py +100 -0
  226. wandb/proto/v3/wandb_telemetry_pb2.py +106 -0
  227. wandb/proto/v4/__init__.py +0 -0
  228. wandb/proto/v4/wandb_base_pb2.py +30 -0
  229. wandb/proto/v4/wandb_internal_pb2.py +384 -0
  230. wandb/proto/v4/wandb_server_pb2.py +64 -0
  231. wandb/proto/v4/wandb_settings_pb2.py +47 -0
  232. wandb/proto/v4/wandb_sync_pb2.py +42 -0
  233. wandb/proto/v4/wandb_telemetry_pb2.py +41 -0
  234. wandb/proto/v5/wandb_base_pb2.py +31 -0
  235. wandb/proto/v5/wandb_internal_pb2.py +385 -0
  236. wandb/proto/v5/wandb_server_pb2.py +65 -0
  237. wandb/proto/v5/wandb_settings_pb2.py +48 -0
  238. wandb/proto/v5/wandb_sync_pb2.py +43 -0
  239. wandb/proto/v5/wandb_telemetry_pb2.py +42 -0
  240. wandb/proto/v6/wandb_base_pb2.py +41 -0
  241. wandb/proto/v6/wandb_internal_pb2.py +395 -0
  242. wandb/proto/v6/wandb_server_pb2.py +75 -0
  243. wandb/proto/v6/wandb_settings_pb2.py +58 -0
  244. wandb/proto/v6/wandb_sync_pb2.py +53 -0
  245. wandb/proto/v6/wandb_telemetry_pb2.py +52 -0
  246. wandb/proto/wandb_base_pb2.py +12 -0
  247. wandb/proto/wandb_deprecated.py +59 -0
  248. wandb/proto/wandb_generate_deprecated.py +30 -0
  249. wandb/proto/wandb_generate_proto.py +50 -0
  250. wandb/proto/wandb_internal_pb2.py +18 -0
  251. wandb/proto/wandb_server_pb2.py +12 -0
  252. wandb/proto/wandb_settings_pb2.py +12 -0
  253. wandb/proto/wandb_sync_pb2.py +12 -0
  254. wandb/proto/wandb_telemetry_pb2.py +12 -0
  255. wandb/py.typed +0 -0
  256. wandb/sdk/__init__.py +37 -0
  257. wandb/sdk/artifacts/__init__.py +0 -0
  258. wandb/sdk/artifacts/_factories.py +22 -0
  259. wandb/sdk/artifacts/_generated/__init__.py +208 -0
  260. wandb/sdk/artifacts/_generated/add_aliases.py +21 -0
  261. wandb/sdk/artifacts/_generated/artifact_by_id.py +17 -0
  262. wandb/sdk/artifacts/_generated/artifact_by_name.py +22 -0
  263. wandb/sdk/artifacts/_generated/artifact_collection_membership_file_urls.py +43 -0
  264. wandb/sdk/artifacts/_generated/artifact_collection_membership_files.py +43 -0
  265. wandb/sdk/artifacts/_generated/artifact_created_by.py +47 -0
  266. wandb/sdk/artifacts/_generated/artifact_file_urls.py +22 -0
  267. wandb/sdk/artifacts/_generated/artifact_type.py +31 -0
  268. wandb/sdk/artifacts/_generated/artifact_used_by.py +43 -0
  269. wandb/sdk/artifacts/_generated/artifact_version_files.py +36 -0
  270. wandb/sdk/artifacts/_generated/artifact_via_membership_by_name.py +26 -0
  271. wandb/sdk/artifacts/_generated/create_artifact_collection_tag_assignments.py +36 -0
  272. wandb/sdk/artifacts/_generated/delete_aliases.py +21 -0
  273. wandb/sdk/artifacts/_generated/delete_artifact.py +28 -0
  274. wandb/sdk/artifacts/_generated/delete_artifact_collection_tag_assignments.py +25 -0
  275. wandb/sdk/artifacts/_generated/delete_artifact_portfolio.py +35 -0
  276. wandb/sdk/artifacts/_generated/delete_artifact_sequence.py +35 -0
  277. wandb/sdk/artifacts/_generated/enums.py +22 -0
  278. wandb/sdk/artifacts/_generated/fetch_artifact_manifest.py +38 -0
  279. wandb/sdk/artifacts/_generated/fetch_linked_artifacts.py +67 -0
  280. wandb/sdk/artifacts/_generated/fetch_registries.py +32 -0
  281. wandb/sdk/artifacts/_generated/fragments.py +524 -0
  282. wandb/sdk/artifacts/_generated/input_types.py +46 -0
  283. wandb/sdk/artifacts/_generated/link_artifact.py +27 -0
  284. wandb/sdk/artifacts/_generated/move_artifact_collection.py +35 -0
  285. wandb/sdk/artifacts/_generated/operations.py +1253 -0
  286. wandb/sdk/artifacts/_generated/project_artifact_collection.py +101 -0
  287. wandb/sdk/artifacts/_generated/project_artifact_collections.py +33 -0
  288. wandb/sdk/artifacts/_generated/project_artifact_type.py +24 -0
  289. wandb/sdk/artifacts/_generated/project_artifact_types.py +24 -0
  290. wandb/sdk/artifacts/_generated/project_artifacts.py +42 -0
  291. wandb/sdk/artifacts/_generated/registry_collections.py +34 -0
  292. wandb/sdk/artifacts/_generated/registry_versions.py +34 -0
  293. wandb/sdk/artifacts/_generated/run_input_artifacts.py +31 -0
  294. wandb/sdk/artifacts/_generated/run_output_artifacts.py +31 -0
  295. wandb/sdk/artifacts/_generated/type_info.py +19 -0
  296. wandb/sdk/artifacts/_generated/unlink_artifact.py +25 -0
  297. wandb/sdk/artifacts/_generated/update_artifact.py +26 -0
  298. wandb/sdk/artifacts/_generated/update_artifact_portfolio.py +35 -0
  299. wandb/sdk/artifacts/_generated/update_artifact_sequence.py +35 -0
  300. wandb/sdk/artifacts/_gqlutils.py +47 -0
  301. wandb/sdk/artifacts/_internal_artifact.py +54 -0
  302. wandb/sdk/artifacts/_models/__init__.py +4 -0
  303. wandb/sdk/artifacts/_models/base_model.py +20 -0
  304. wandb/sdk/artifacts/_validators.py +338 -0
  305. wandb/sdk/artifacts/artifact.py +2683 -0
  306. wandb/sdk/artifacts/artifact_download_logger.py +45 -0
  307. wandb/sdk/artifacts/artifact_file_cache.py +256 -0
  308. wandb/sdk/artifacts/artifact_instance_cache.py +17 -0
  309. wandb/sdk/artifacts/artifact_manifest.py +76 -0
  310. wandb/sdk/artifacts/artifact_manifest_entry.py +315 -0
  311. wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
  312. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +94 -0
  313. wandb/sdk/artifacts/artifact_saver.py +277 -0
  314. wandb/sdk/artifacts/artifact_state.py +13 -0
  315. wandb/sdk/artifacts/artifact_ttl.py +9 -0
  316. wandb/sdk/artifacts/exceptions.py +72 -0
  317. wandb/sdk/artifacts/staging.py +27 -0
  318. wandb/sdk/artifacts/storage_handler.py +62 -0
  319. wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
  320. wandb/sdk/artifacts/storage_handlers/azure_handler.py +214 -0
  321. wandb/sdk/artifacts/storage_handlers/gcs_handler.py +224 -0
  322. wandb/sdk/artifacts/storage_handlers/http_handler.py +112 -0
  323. wandb/sdk/artifacts/storage_handlers/local_file_handler.py +142 -0
  324. wandb/sdk/artifacts/storage_handlers/multi_handler.py +56 -0
  325. wandb/sdk/artifacts/storage_handlers/s3_handler.py +340 -0
  326. wandb/sdk/artifacts/storage_handlers/tracking_handler.py +68 -0
  327. wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +131 -0
  328. wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +74 -0
  329. wandb/sdk/artifacts/storage_layout.py +8 -0
  330. wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
  331. wandb/sdk/artifacts/storage_policies/_factories.py +63 -0
  332. wandb/sdk/artifacts/storage_policies/register.py +1 -0
  333. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +525 -0
  334. wandb/sdk/artifacts/storage_policy.py +75 -0
  335. wandb/sdk/backend/__init__.py +0 -0
  336. wandb/sdk/backend/backend.py +57 -0
  337. wandb/sdk/data_types/__init__.py +0 -0
  338. wandb/sdk/data_types/_dtypes.py +914 -0
  339. wandb/sdk/data_types/_private.py +10 -0
  340. wandb/sdk/data_types/audio.py +208 -0
  341. wandb/sdk/data_types/base_types/__init__.py +0 -0
  342. wandb/sdk/data_types/base_types/json_metadata.py +55 -0
  343. wandb/sdk/data_types/base_types/media.py +339 -0
  344. wandb/sdk/data_types/base_types/wb_value.py +295 -0
  345. wandb/sdk/data_types/bokeh.py +91 -0
  346. wandb/sdk/data_types/graph.py +439 -0
  347. wandb/sdk/data_types/helper_types/__init__.py +0 -0
  348. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +327 -0
  349. wandb/sdk/data_types/helper_types/classes.py +159 -0
  350. wandb/sdk/data_types/helper_types/image_mask.py +251 -0
  351. wandb/sdk/data_types/histogram.py +107 -0
  352. wandb/sdk/data_types/html.py +165 -0
  353. wandb/sdk/data_types/image.py +985 -0
  354. wandb/sdk/data_types/molecule.py +250 -0
  355. wandb/sdk/data_types/object_3d.py +495 -0
  356. wandb/sdk/data_types/plotly.py +95 -0
  357. wandb/sdk/data_types/saved_model.py +435 -0
  358. wandb/sdk/data_types/table.py +1468 -0
  359. wandb/sdk/data_types/table_decorators.py +108 -0
  360. wandb/sdk/data_types/trace_tree.py +440 -0
  361. wandb/sdk/data_types/utils.py +260 -0
  362. wandb/sdk/data_types/video.py +303 -0
  363. wandb/sdk/integration_utils/__init__.py +0 -0
  364. wandb/sdk/integration_utils/auto_logging.py +232 -0
  365. wandb/sdk/integration_utils/data_logging.py +475 -0
  366. wandb/sdk/interface/__init__.py +0 -0
  367. wandb/sdk/interface/constants.py +4 -0
  368. wandb/sdk/interface/interface.py +1093 -0
  369. wandb/sdk/interface/interface_queue.py +50 -0
  370. wandb/sdk/interface/interface_shared.py +473 -0
  371. wandb/sdk/interface/interface_sock.py +55 -0
  372. wandb/sdk/interface/summary_record.py +67 -0
  373. wandb/sdk/internal/__init__.py +0 -0
  374. wandb/sdk/internal/_generated/__init__.py +5 -0
  375. wandb/sdk/internal/_generated/enums.py +4 -0
  376. wandb/sdk/internal/_generated/input_types.py +4 -0
  377. wandb/sdk/internal/_generated/operations.py +15 -0
  378. wandb/sdk/internal/_generated/server_features_query.py +27 -0
  379. wandb/sdk/internal/context.py +89 -0
  380. wandb/sdk/internal/datastore.py +293 -0
  381. wandb/sdk/internal/file_pusher.py +177 -0
  382. wandb/sdk/internal/file_stream.py +686 -0
  383. wandb/sdk/internal/handler.py +854 -0
  384. wandb/sdk/internal/incremental_table_util.py +53 -0
  385. wandb/sdk/internal/internal_api.py +4723 -0
  386. wandb/sdk/internal/job_builder.py +639 -0
  387. wandb/sdk/internal/profiler.py +79 -0
  388. wandb/sdk/internal/progress.py +77 -0
  389. wandb/sdk/internal/run.py +27 -0
  390. wandb/sdk/internal/sample.py +70 -0
  391. wandb/sdk/internal/sender.py +1692 -0
  392. wandb/sdk/internal/sender_config.py +203 -0
  393. wandb/sdk/internal/settings_static.py +40 -0
  394. wandb/sdk/internal/tb_watcher.py +519 -0
  395. wandb/sdk/internal/thread_local_settings.py +18 -0
  396. wandb/sdk/launch/__init__.py +15 -0
  397. wandb/sdk/launch/_launch.py +331 -0
  398. wandb/sdk/launch/_launch_add.py +255 -0
  399. wandb/sdk/launch/_project_spec.py +565 -0
  400. wandb/sdk/launch/agent/__init__.py +5 -0
  401. wandb/sdk/launch/agent/agent.py +931 -0
  402. wandb/sdk/launch/agent/config.py +296 -0
  403. wandb/sdk/launch/agent/job_status_tracker.py +55 -0
  404. wandb/sdk/launch/agent/run_queue_item_file_saver.py +39 -0
  405. wandb/sdk/launch/builder/__init__.py +0 -0
  406. wandb/sdk/launch/builder/abstract.py +156 -0
  407. wandb/sdk/launch/builder/build.py +296 -0
  408. wandb/sdk/launch/builder/context_manager.py +235 -0
  409. wandb/sdk/launch/builder/docker_builder.py +177 -0
  410. wandb/sdk/launch/builder/kaniko_builder.py +595 -0
  411. wandb/sdk/launch/builder/noop.py +58 -0
  412. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +188 -0
  413. wandb/sdk/launch/builder/templates/dockerfile.py +92 -0
  414. wandb/sdk/launch/create_job.py +541 -0
  415. wandb/sdk/launch/environment/abstract.py +29 -0
  416. wandb/sdk/launch/environment/aws_environment.py +322 -0
  417. wandb/sdk/launch/environment/azure_environment.py +105 -0
  418. wandb/sdk/launch/environment/gcp_environment.py +334 -0
  419. wandb/sdk/launch/environment/local_environment.py +65 -0
  420. wandb/sdk/launch/errors.py +13 -0
  421. wandb/sdk/launch/git_reference.py +109 -0
  422. wandb/sdk/launch/inputs/files.py +148 -0
  423. wandb/sdk/launch/inputs/internal.py +315 -0
  424. wandb/sdk/launch/inputs/manage.py +113 -0
  425. wandb/sdk/launch/inputs/schema.py +70 -0
  426. wandb/sdk/launch/loader.py +249 -0
  427. wandb/sdk/launch/registry/abstract.py +48 -0
  428. wandb/sdk/launch/registry/anon.py +29 -0
  429. wandb/sdk/launch/registry/azure_container_registry.py +124 -0
  430. wandb/sdk/launch/registry/elastic_container_registry.py +192 -0
  431. wandb/sdk/launch/registry/google_artifact_registry.py +219 -0
  432. wandb/sdk/launch/registry/local_registry.py +65 -0
  433. wandb/sdk/launch/runner/__init__.py +0 -0
  434. wandb/sdk/launch/runner/abstract.py +185 -0
  435. wandb/sdk/launch/runner/kubernetes_monitor.py +473 -0
  436. wandb/sdk/launch/runner/kubernetes_runner.py +1290 -0
  437. wandb/sdk/launch/runner/local_container.py +301 -0
  438. wandb/sdk/launch/runner/local_process.py +78 -0
  439. wandb/sdk/launch/runner/sagemaker_runner.py +424 -0
  440. wandb/sdk/launch/runner/vertex_runner.py +225 -0
  441. wandb/sdk/launch/sweeps/__init__.py +37 -0
  442. wandb/sdk/launch/sweeps/scheduler.py +739 -0
  443. wandb/sdk/launch/sweeps/scheduler_sweep.py +90 -0
  444. wandb/sdk/launch/sweeps/utils.py +324 -0
  445. wandb/sdk/launch/utils.py +827 -0
  446. wandb/sdk/launch/wandb_reference.py +138 -0
  447. wandb/sdk/lib/__init__.py +5 -0
  448. wandb/sdk/lib/apikey.py +334 -0
  449. wandb/sdk/lib/asyncio_compat.py +278 -0
  450. wandb/sdk/lib/asyncio_manager.py +252 -0
  451. wandb/sdk/lib/capped_dict.py +26 -0
  452. wandb/sdk/lib/config_util.py +101 -0
  453. wandb/sdk/lib/console_capture.py +219 -0
  454. wandb/sdk/lib/credentials.py +141 -0
  455. wandb/sdk/lib/deprecate.py +27 -0
  456. wandb/sdk/lib/disabled.py +30 -0
  457. wandb/sdk/lib/exit_hooks.py +54 -0
  458. wandb/sdk/lib/file_stream_utils.py +118 -0
  459. wandb/sdk/lib/filenames.py +64 -0
  460. wandb/sdk/lib/filesystem.py +372 -0
  461. wandb/sdk/lib/fsm.py +165 -0
  462. wandb/sdk/lib/gitlib.py +240 -0
  463. wandb/sdk/lib/gql_request.py +76 -0
  464. wandb/sdk/lib/handler_util.py +21 -0
  465. wandb/sdk/lib/hashutil.py +106 -0
  466. wandb/sdk/lib/import_hooks.py +275 -0
  467. wandb/sdk/lib/interrupt.py +37 -0
  468. wandb/sdk/lib/ipython.py +126 -0
  469. wandb/sdk/lib/json_util.py +75 -0
  470. wandb/sdk/lib/lazyloader.py +63 -0
  471. wandb/sdk/lib/module.py +72 -0
  472. wandb/sdk/lib/paths.py +108 -0
  473. wandb/sdk/lib/preinit.py +42 -0
  474. wandb/sdk/lib/printer.py +567 -0
  475. wandb/sdk/lib/printer_asyncio.py +48 -0
  476. wandb/sdk/lib/progress.py +325 -0
  477. wandb/sdk/lib/proto_util.py +90 -0
  478. wandb/sdk/lib/redirect.py +876 -0
  479. wandb/sdk/lib/retry.py +395 -0
  480. wandb/sdk/lib/run_moment.py +82 -0
  481. wandb/sdk/lib/runid.py +12 -0
  482. wandb/sdk/lib/server.py +58 -0
  483. wandb/sdk/lib/service/ipc_support.py +13 -0
  484. wandb/sdk/lib/service/service_client.py +102 -0
  485. wandb/sdk/lib/service/service_connection.py +238 -0
  486. wandb/sdk/lib/service/service_port_file.py +105 -0
  487. wandb/sdk/lib/service/service_process.py +111 -0
  488. wandb/sdk/lib/service/service_token.py +181 -0
  489. wandb/sdk/lib/sparkline.py +44 -0
  490. wandb/sdk/lib/telemetry.py +100 -0
  491. wandb/sdk/lib/timed_input.py +133 -0
  492. wandb/sdk/lib/timer.py +19 -0
  493. wandb/sdk/lib/wb_logging.py +161 -0
  494. wandb/sdk/mailbox/__init__.py +23 -0
  495. wandb/sdk/mailbox/mailbox.py +143 -0
  496. wandb/sdk/mailbox/mailbox_handle.py +134 -0
  497. wandb/sdk/mailbox/response_handle.py +99 -0
  498. wandb/sdk/mailbox/wait_with_progress.py +100 -0
  499. wandb/sdk/projects/_generated/__init__.py +26 -0
  500. wandb/sdk/projects/_generated/delete_project.py +22 -0
  501. wandb/sdk/projects/_generated/enums.py +4 -0
  502. wandb/sdk/projects/_generated/fetch_registry.py +22 -0
  503. wandb/sdk/projects/_generated/fragments.py +41 -0
  504. wandb/sdk/projects/_generated/input_types.py +13 -0
  505. wandb/sdk/projects/_generated/operations.py +88 -0
  506. wandb/sdk/projects/_generated/rename_project.py +27 -0
  507. wandb/sdk/projects/_generated/upsert_registry_project.py +27 -0
  508. wandb/sdk/verify/__init__.py +0 -0
  509. wandb/sdk/verify/verify.py +555 -0
  510. wandb/sdk/wandb_alerts.py +12 -0
  511. wandb/sdk/wandb_config.py +323 -0
  512. wandb/sdk/wandb_helper.py +54 -0
  513. wandb/sdk/wandb_init.py +1601 -0
  514. wandb/sdk/wandb_login.py +358 -0
  515. wandb/sdk/wandb_metric.py +112 -0
  516. wandb/sdk/wandb_require.py +88 -0
  517. wandb/sdk/wandb_require_helpers.py +44 -0
  518. wandb/sdk/wandb_run.py +4102 -0
  519. wandb/sdk/wandb_settings.py +2197 -0
  520. wandb/sdk/wandb_setup.py +560 -0
  521. wandb/sdk/wandb_summary.py +150 -0
  522. wandb/sdk/wandb_sweep.py +120 -0
  523. wandb/sdk/wandb_sync.py +71 -0
  524. wandb/sdk/wandb_watch.py +146 -0
  525. wandb/sklearn.py +35 -0
  526. wandb/sync/__init__.py +3 -0
  527. wandb/sync/sync.py +457 -0
  528. wandb/trigger.py +29 -0
  529. wandb/util.py +2040 -0
  530. wandb/vendor/__init__.py +0 -0
  531. wandb/vendor/gql-0.2.0/setup.py +40 -0
  532. wandb/vendor/gql-0.2.0/tests/__init__.py +0 -0
  533. wandb/vendor/gql-0.2.0/tests/starwars/__init__.py +0 -0
  534. wandb/vendor/gql-0.2.0/tests/starwars/fixtures.py +96 -0
  535. wandb/vendor/gql-0.2.0/tests/starwars/schema.py +146 -0
  536. wandb/vendor/gql-0.2.0/tests/starwars/test_dsl.py +293 -0
  537. wandb/vendor/gql-0.2.0/tests/starwars/test_query.py +355 -0
  538. wandb/vendor/gql-0.2.0/tests/starwars/test_validation.py +171 -0
  539. wandb/vendor/gql-0.2.0/tests/test_client.py +31 -0
  540. wandb/vendor/gql-0.2.0/tests/test_transport.py +89 -0
  541. wandb/vendor/gql-0.2.0/wandb_gql/__init__.py +4 -0
  542. wandb/vendor/gql-0.2.0/wandb_gql/client.py +75 -0
  543. wandb/vendor/gql-0.2.0/wandb_gql/dsl.py +152 -0
  544. wandb/vendor/gql-0.2.0/wandb_gql/gql.py +10 -0
  545. wandb/vendor/gql-0.2.0/wandb_gql/transport/__init__.py +0 -0
  546. wandb/vendor/gql-0.2.0/wandb_gql/transport/http.py +6 -0
  547. wandb/vendor/gql-0.2.0/wandb_gql/transport/local_schema.py +15 -0
  548. wandb/vendor/gql-0.2.0/wandb_gql/transport/requests.py +46 -0
  549. wandb/vendor/gql-0.2.0/wandb_gql/utils.py +21 -0
  550. wandb/vendor/graphql-core-1.1/setup.py +86 -0
  551. wandb/vendor/graphql-core-1.1/wandb_graphql/__init__.py +287 -0
  552. wandb/vendor/graphql-core-1.1/wandb_graphql/error/__init__.py +6 -0
  553. wandb/vendor/graphql-core-1.1/wandb_graphql/error/base.py +42 -0
  554. wandb/vendor/graphql-core-1.1/wandb_graphql/error/format_error.py +11 -0
  555. wandb/vendor/graphql-core-1.1/wandb_graphql/error/located_error.py +29 -0
  556. wandb/vendor/graphql-core-1.1/wandb_graphql/error/syntax_error.py +36 -0
  557. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/__init__.py +26 -0
  558. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/base.py +311 -0
  559. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executor.py +398 -0
  560. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/__init__.py +0 -0
  561. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/asyncio.py +53 -0
  562. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/gevent.py +22 -0
  563. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/process.py +32 -0
  564. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/sync.py +7 -0
  565. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/thread.py +35 -0
  566. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/utils.py +6 -0
  567. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/__init__.py +0 -0
  568. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/executor.py +66 -0
  569. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/fragment.py +252 -0
  570. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/resolver.py +151 -0
  571. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/utils.py +7 -0
  572. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/middleware.py +57 -0
  573. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/values.py +145 -0
  574. wandb/vendor/graphql-core-1.1/wandb_graphql/graphql.py +60 -0
  575. wandb/vendor/graphql-core-1.1/wandb_graphql/language/__init__.py +0 -0
  576. wandb/vendor/graphql-core-1.1/wandb_graphql/language/ast.py +1349 -0
  577. wandb/vendor/graphql-core-1.1/wandb_graphql/language/base.py +19 -0
  578. wandb/vendor/graphql-core-1.1/wandb_graphql/language/lexer.py +435 -0
  579. wandb/vendor/graphql-core-1.1/wandb_graphql/language/location.py +30 -0
  580. wandb/vendor/graphql-core-1.1/wandb_graphql/language/parser.py +779 -0
  581. wandb/vendor/graphql-core-1.1/wandb_graphql/language/printer.py +193 -0
  582. wandb/vendor/graphql-core-1.1/wandb_graphql/language/source.py +18 -0
  583. wandb/vendor/graphql-core-1.1/wandb_graphql/language/visitor.py +222 -0
  584. wandb/vendor/graphql-core-1.1/wandb_graphql/language/visitor_meta.py +82 -0
  585. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/__init__.py +0 -0
  586. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/cached_property.py +17 -0
  587. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/contain_subset.py +28 -0
  588. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/default_ordered_dict.py +40 -0
  589. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/ordereddict.py +8 -0
  590. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/pair_set.py +43 -0
  591. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/version.py +78 -0
  592. wandb/vendor/graphql-core-1.1/wandb_graphql/type/__init__.py +67 -0
  593. wandb/vendor/graphql-core-1.1/wandb_graphql/type/definition.py +619 -0
  594. wandb/vendor/graphql-core-1.1/wandb_graphql/type/directives.py +132 -0
  595. wandb/vendor/graphql-core-1.1/wandb_graphql/type/introspection.py +440 -0
  596. wandb/vendor/graphql-core-1.1/wandb_graphql/type/scalars.py +131 -0
  597. wandb/vendor/graphql-core-1.1/wandb_graphql/type/schema.py +100 -0
  598. wandb/vendor/graphql-core-1.1/wandb_graphql/type/typemap.py +145 -0
  599. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/__init__.py +0 -0
  600. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/assert_valid_name.py +9 -0
  601. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_from_value.py +65 -0
  602. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_to_code.py +49 -0
  603. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_to_dict.py +24 -0
  604. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/base.py +75 -0
  605. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/build_ast_schema.py +291 -0
  606. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/build_client_schema.py +250 -0
  607. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/concat_ast.py +9 -0
  608. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/extend_schema.py +357 -0
  609. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/get_field_def.py +27 -0
  610. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/get_operation_ast.py +21 -0
  611. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/introspection_query.py +90 -0
  612. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/is_valid_literal_value.py +67 -0
  613. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/is_valid_value.py +66 -0
  614. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/quoted_or_list.py +21 -0
  615. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/schema_printer.py +168 -0
  616. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/suggestion_list.py +56 -0
  617. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_comparators.py +69 -0
  618. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_from_ast.py +21 -0
  619. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_info.py +149 -0
  620. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/value_from_ast.py +69 -0
  621. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/__init__.py +4 -0
  622. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/__init__.py +79 -0
  623. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/arguments_of_correct_type.py +24 -0
  624. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/base.py +8 -0
  625. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/default_values_of_correct_type.py +44 -0
  626. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/fields_on_correct_type.py +113 -0
  627. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/fragments_on_composite_types.py +33 -0
  628. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_argument_names.py +70 -0
  629. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_directives.py +97 -0
  630. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_fragment_names.py +19 -0
  631. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_type_names.py +43 -0
  632. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/lone_anonymous_operation.py +23 -0
  633. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_fragment_cycles.py +59 -0
  634. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_undefined_variables.py +36 -0
  635. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_unused_fragments.py +38 -0
  636. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_unused_variables.py +37 -0
  637. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/overlapping_fields_can_be_merged.py +529 -0
  638. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/possible_fragment_spreads.py +44 -0
  639. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/provided_non_null_arguments.py +46 -0
  640. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/scalar_leafs.py +33 -0
  641. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_argument_names.py +32 -0
  642. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_fragment_names.py +28 -0
  643. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_input_field_names.py +33 -0
  644. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_operation_names.py +31 -0
  645. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_variable_names.py +27 -0
  646. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/variables_are_input_types.py +21 -0
  647. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/variables_in_allowed_position.py +53 -0
  648. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/validation.py +158 -0
  649. wandb/vendor/promise-2.3.0/conftest.py +30 -0
  650. wandb/vendor/promise-2.3.0/setup.py +64 -0
  651. wandb/vendor/promise-2.3.0/tests/__init__.py +0 -0
  652. wandb/vendor/promise-2.3.0/tests/conftest.py +8 -0
  653. wandb/vendor/promise-2.3.0/tests/test_awaitable.py +32 -0
  654. wandb/vendor/promise-2.3.0/tests/test_awaitable_35.py +47 -0
  655. wandb/vendor/promise-2.3.0/tests/test_benchmark.py +116 -0
  656. wandb/vendor/promise-2.3.0/tests/test_complex_threads.py +23 -0
  657. wandb/vendor/promise-2.3.0/tests/test_dataloader.py +452 -0
  658. wandb/vendor/promise-2.3.0/tests/test_dataloader_awaitable_35.py +99 -0
  659. wandb/vendor/promise-2.3.0/tests/test_dataloader_extra.py +65 -0
  660. wandb/vendor/promise-2.3.0/tests/test_extra.py +670 -0
  661. wandb/vendor/promise-2.3.0/tests/test_issues.py +132 -0
  662. wandb/vendor/promise-2.3.0/tests/test_promise_list.py +70 -0
  663. wandb/vendor/promise-2.3.0/tests/test_spec.py +584 -0
  664. wandb/vendor/promise-2.3.0/tests/test_thread_safety.py +115 -0
  665. wandb/vendor/promise-2.3.0/tests/utils.py +3 -0
  666. wandb/vendor/promise-2.3.0/wandb_promise/__init__.py +38 -0
  667. wandb/vendor/promise-2.3.0/wandb_promise/async_.py +135 -0
  668. wandb/vendor/promise-2.3.0/wandb_promise/compat.py +32 -0
  669. wandb/vendor/promise-2.3.0/wandb_promise/dataloader.py +326 -0
  670. wandb/vendor/promise-2.3.0/wandb_promise/iterate_promise.py +12 -0
  671. wandb/vendor/promise-2.3.0/wandb_promise/promise.py +848 -0
  672. wandb/vendor/promise-2.3.0/wandb_promise/promise_list.py +151 -0
  673. wandb/vendor/promise-2.3.0/wandb_promise/pyutils/__init__.py +0 -0
  674. wandb/vendor/promise-2.3.0/wandb_promise/pyutils/version.py +83 -0
  675. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/__init__.py +0 -0
  676. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/asyncio.py +22 -0
  677. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/gevent.py +21 -0
  678. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/immediate.py +27 -0
  679. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/thread.py +18 -0
  680. wandb/vendor/promise-2.3.0/wandb_promise/utils.py +56 -0
  681. wandb/vendor/pygments/__init__.py +90 -0
  682. wandb/vendor/pygments/cmdline.py +568 -0
  683. wandb/vendor/pygments/console.py +74 -0
  684. wandb/vendor/pygments/filter.py +74 -0
  685. wandb/vendor/pygments/filters/__init__.py +350 -0
  686. wandb/vendor/pygments/formatter.py +95 -0
  687. wandb/vendor/pygments/formatters/__init__.py +153 -0
  688. wandb/vendor/pygments/formatters/_mapping.py +85 -0
  689. wandb/vendor/pygments/formatters/bbcode.py +109 -0
  690. wandb/vendor/pygments/formatters/html.py +851 -0
  691. wandb/vendor/pygments/formatters/img.py +600 -0
  692. wandb/vendor/pygments/formatters/irc.py +182 -0
  693. wandb/vendor/pygments/formatters/latex.py +482 -0
  694. wandb/vendor/pygments/formatters/other.py +160 -0
  695. wandb/vendor/pygments/formatters/rtf.py +147 -0
  696. wandb/vendor/pygments/formatters/svg.py +153 -0
  697. wandb/vendor/pygments/formatters/terminal.py +136 -0
  698. wandb/vendor/pygments/formatters/terminal256.py +309 -0
  699. wandb/vendor/pygments/lexer.py +871 -0
  700. wandb/vendor/pygments/lexers/__init__.py +329 -0
  701. wandb/vendor/pygments/lexers/_asy_builtins.py +1645 -0
  702. wandb/vendor/pygments/lexers/_cl_builtins.py +232 -0
  703. wandb/vendor/pygments/lexers/_cocoa_builtins.py +72 -0
  704. wandb/vendor/pygments/lexers/_csound_builtins.py +1346 -0
  705. wandb/vendor/pygments/lexers/_lasso_builtins.py +5327 -0
  706. wandb/vendor/pygments/lexers/_lua_builtins.py +295 -0
  707. wandb/vendor/pygments/lexers/_mapping.py +500 -0
  708. wandb/vendor/pygments/lexers/_mql_builtins.py +1172 -0
  709. wandb/vendor/pygments/lexers/_openedge_builtins.py +2547 -0
  710. wandb/vendor/pygments/lexers/_php_builtins.py +4756 -0
  711. wandb/vendor/pygments/lexers/_postgres_builtins.py +621 -0
  712. wandb/vendor/pygments/lexers/_scilab_builtins.py +3094 -0
  713. wandb/vendor/pygments/lexers/_sourcemod_builtins.py +1163 -0
  714. wandb/vendor/pygments/lexers/_stan_builtins.py +532 -0
  715. wandb/vendor/pygments/lexers/_stata_builtins.py +419 -0
  716. wandb/vendor/pygments/lexers/_tsql_builtins.py +1004 -0
  717. wandb/vendor/pygments/lexers/_vim_builtins.py +1939 -0
  718. wandb/vendor/pygments/lexers/actionscript.py +240 -0
  719. wandb/vendor/pygments/lexers/agile.py +24 -0
  720. wandb/vendor/pygments/lexers/algebra.py +221 -0
  721. wandb/vendor/pygments/lexers/ambient.py +76 -0
  722. wandb/vendor/pygments/lexers/ampl.py +87 -0
  723. wandb/vendor/pygments/lexers/apl.py +101 -0
  724. wandb/vendor/pygments/lexers/archetype.py +318 -0
  725. wandb/vendor/pygments/lexers/asm.py +641 -0
  726. wandb/vendor/pygments/lexers/automation.py +374 -0
  727. wandb/vendor/pygments/lexers/basic.py +500 -0
  728. wandb/vendor/pygments/lexers/bibtex.py +160 -0
  729. wandb/vendor/pygments/lexers/business.py +612 -0
  730. wandb/vendor/pygments/lexers/c_cpp.py +252 -0
  731. wandb/vendor/pygments/lexers/c_like.py +541 -0
  732. wandb/vendor/pygments/lexers/capnproto.py +78 -0
  733. wandb/vendor/pygments/lexers/chapel.py +102 -0
  734. wandb/vendor/pygments/lexers/clean.py +288 -0
  735. wandb/vendor/pygments/lexers/compiled.py +34 -0
  736. wandb/vendor/pygments/lexers/configs.py +833 -0
  737. wandb/vendor/pygments/lexers/console.py +114 -0
  738. wandb/vendor/pygments/lexers/crystal.py +393 -0
  739. wandb/vendor/pygments/lexers/csound.py +366 -0
  740. wandb/vendor/pygments/lexers/css.py +689 -0
  741. wandb/vendor/pygments/lexers/d.py +251 -0
  742. wandb/vendor/pygments/lexers/dalvik.py +125 -0
  743. wandb/vendor/pygments/lexers/data.py +555 -0
  744. wandb/vendor/pygments/lexers/diff.py +165 -0
  745. wandb/vendor/pygments/lexers/dotnet.py +691 -0
  746. wandb/vendor/pygments/lexers/dsls.py +878 -0
  747. wandb/vendor/pygments/lexers/dylan.py +289 -0
  748. wandb/vendor/pygments/lexers/ecl.py +125 -0
  749. wandb/vendor/pygments/lexers/eiffel.py +65 -0
  750. wandb/vendor/pygments/lexers/elm.py +121 -0
  751. wandb/vendor/pygments/lexers/erlang.py +533 -0
  752. wandb/vendor/pygments/lexers/esoteric.py +277 -0
  753. wandb/vendor/pygments/lexers/ezhil.py +69 -0
  754. wandb/vendor/pygments/lexers/factor.py +344 -0
  755. wandb/vendor/pygments/lexers/fantom.py +250 -0
  756. wandb/vendor/pygments/lexers/felix.py +273 -0
  757. wandb/vendor/pygments/lexers/forth.py +177 -0
  758. wandb/vendor/pygments/lexers/fortran.py +205 -0
  759. wandb/vendor/pygments/lexers/foxpro.py +428 -0
  760. wandb/vendor/pygments/lexers/functional.py +21 -0
  761. wandb/vendor/pygments/lexers/go.py +101 -0
  762. wandb/vendor/pygments/lexers/grammar_notation.py +213 -0
  763. wandb/vendor/pygments/lexers/graph.py +80 -0
  764. wandb/vendor/pygments/lexers/graphics.py +553 -0
  765. wandb/vendor/pygments/lexers/haskell.py +843 -0
  766. wandb/vendor/pygments/lexers/haxe.py +936 -0
  767. wandb/vendor/pygments/lexers/hdl.py +382 -0
  768. wandb/vendor/pygments/lexers/hexdump.py +103 -0
  769. wandb/vendor/pygments/lexers/html.py +602 -0
  770. wandb/vendor/pygments/lexers/idl.py +270 -0
  771. wandb/vendor/pygments/lexers/igor.py +288 -0
  772. wandb/vendor/pygments/lexers/inferno.py +96 -0
  773. wandb/vendor/pygments/lexers/installers.py +322 -0
  774. wandb/vendor/pygments/lexers/int_fiction.py +1343 -0
  775. wandb/vendor/pygments/lexers/iolang.py +63 -0
  776. wandb/vendor/pygments/lexers/j.py +146 -0
  777. wandb/vendor/pygments/lexers/javascript.py +1525 -0
  778. wandb/vendor/pygments/lexers/julia.py +333 -0
  779. wandb/vendor/pygments/lexers/jvm.py +1573 -0
  780. wandb/vendor/pygments/lexers/lisp.py +2621 -0
  781. wandb/vendor/pygments/lexers/make.py +202 -0
  782. wandb/vendor/pygments/lexers/markup.py +595 -0
  783. wandb/vendor/pygments/lexers/math.py +21 -0
  784. wandb/vendor/pygments/lexers/matlab.py +663 -0
  785. wandb/vendor/pygments/lexers/ml.py +769 -0
  786. wandb/vendor/pygments/lexers/modeling.py +358 -0
  787. wandb/vendor/pygments/lexers/modula2.py +1561 -0
  788. wandb/vendor/pygments/lexers/monte.py +204 -0
  789. wandb/vendor/pygments/lexers/ncl.py +894 -0
  790. wandb/vendor/pygments/lexers/nimrod.py +159 -0
  791. wandb/vendor/pygments/lexers/nit.py +64 -0
  792. wandb/vendor/pygments/lexers/nix.py +136 -0
  793. wandb/vendor/pygments/lexers/oberon.py +105 -0
  794. wandb/vendor/pygments/lexers/objective.py +504 -0
  795. wandb/vendor/pygments/lexers/ooc.py +85 -0
  796. wandb/vendor/pygments/lexers/other.py +41 -0
  797. wandb/vendor/pygments/lexers/parasail.py +79 -0
  798. wandb/vendor/pygments/lexers/parsers.py +835 -0
  799. wandb/vendor/pygments/lexers/pascal.py +644 -0
  800. wandb/vendor/pygments/lexers/pawn.py +199 -0
  801. wandb/vendor/pygments/lexers/perl.py +620 -0
  802. wandb/vendor/pygments/lexers/php.py +267 -0
  803. wandb/vendor/pygments/lexers/praat.py +294 -0
  804. wandb/vendor/pygments/lexers/prolog.py +306 -0
  805. wandb/vendor/pygments/lexers/python.py +939 -0
  806. wandb/vendor/pygments/lexers/qvt.py +152 -0
  807. wandb/vendor/pygments/lexers/r.py +453 -0
  808. wandb/vendor/pygments/lexers/rdf.py +270 -0
  809. wandb/vendor/pygments/lexers/rebol.py +431 -0
  810. wandb/vendor/pygments/lexers/resource.py +85 -0
  811. wandb/vendor/pygments/lexers/rnc.py +67 -0
  812. wandb/vendor/pygments/lexers/roboconf.py +82 -0
  813. wandb/vendor/pygments/lexers/robotframework.py +560 -0
  814. wandb/vendor/pygments/lexers/ruby.py +519 -0
  815. wandb/vendor/pygments/lexers/rust.py +220 -0
  816. wandb/vendor/pygments/lexers/sas.py +228 -0
  817. wandb/vendor/pygments/lexers/scripting.py +1222 -0
  818. wandb/vendor/pygments/lexers/shell.py +794 -0
  819. wandb/vendor/pygments/lexers/smalltalk.py +195 -0
  820. wandb/vendor/pygments/lexers/smv.py +79 -0
  821. wandb/vendor/pygments/lexers/snobol.py +83 -0
  822. wandb/vendor/pygments/lexers/special.py +103 -0
  823. wandb/vendor/pygments/lexers/sql.py +681 -0
  824. wandb/vendor/pygments/lexers/stata.py +108 -0
  825. wandb/vendor/pygments/lexers/supercollider.py +90 -0
  826. wandb/vendor/pygments/lexers/tcl.py +145 -0
  827. wandb/vendor/pygments/lexers/templates.py +2283 -0
  828. wandb/vendor/pygments/lexers/testing.py +207 -0
  829. wandb/vendor/pygments/lexers/text.py +25 -0
  830. wandb/vendor/pygments/lexers/textedit.py +169 -0
  831. wandb/vendor/pygments/lexers/textfmts.py +297 -0
  832. wandb/vendor/pygments/lexers/theorem.py +458 -0
  833. wandb/vendor/pygments/lexers/trafficscript.py +54 -0
  834. wandb/vendor/pygments/lexers/typoscript.py +226 -0
  835. wandb/vendor/pygments/lexers/urbi.py +133 -0
  836. wandb/vendor/pygments/lexers/varnish.py +190 -0
  837. wandb/vendor/pygments/lexers/verification.py +111 -0
  838. wandb/vendor/pygments/lexers/web.py +24 -0
  839. wandb/vendor/pygments/lexers/webmisc.py +988 -0
  840. wandb/vendor/pygments/lexers/whiley.py +116 -0
  841. wandb/vendor/pygments/lexers/x10.py +69 -0
  842. wandb/vendor/pygments/modeline.py +44 -0
  843. wandb/vendor/pygments/plugin.py +68 -0
  844. wandb/vendor/pygments/regexopt.py +92 -0
  845. wandb/vendor/pygments/scanner.py +105 -0
  846. wandb/vendor/pygments/sphinxext.py +158 -0
  847. wandb/vendor/pygments/style.py +155 -0
  848. wandb/vendor/pygments/styles/__init__.py +80 -0
  849. wandb/vendor/pygments/styles/abap.py +29 -0
  850. wandb/vendor/pygments/styles/algol.py +63 -0
  851. wandb/vendor/pygments/styles/algol_nu.py +63 -0
  852. wandb/vendor/pygments/styles/arduino.py +98 -0
  853. wandb/vendor/pygments/styles/autumn.py +65 -0
  854. wandb/vendor/pygments/styles/borland.py +51 -0
  855. wandb/vendor/pygments/styles/bw.py +49 -0
  856. wandb/vendor/pygments/styles/colorful.py +81 -0
  857. wandb/vendor/pygments/styles/default.py +73 -0
  858. wandb/vendor/pygments/styles/emacs.py +72 -0
  859. wandb/vendor/pygments/styles/friendly.py +72 -0
  860. wandb/vendor/pygments/styles/fruity.py +42 -0
  861. wandb/vendor/pygments/styles/igor.py +29 -0
  862. wandb/vendor/pygments/styles/lovelace.py +97 -0
  863. wandb/vendor/pygments/styles/manni.py +75 -0
  864. wandb/vendor/pygments/styles/monokai.py +106 -0
  865. wandb/vendor/pygments/styles/murphy.py +80 -0
  866. wandb/vendor/pygments/styles/native.py +65 -0
  867. wandb/vendor/pygments/styles/paraiso_dark.py +125 -0
  868. wandb/vendor/pygments/styles/paraiso_light.py +125 -0
  869. wandb/vendor/pygments/styles/pastie.py +75 -0
  870. wandb/vendor/pygments/styles/perldoc.py +69 -0
  871. wandb/vendor/pygments/styles/rainbow_dash.py +89 -0
  872. wandb/vendor/pygments/styles/rrt.py +33 -0
  873. wandb/vendor/pygments/styles/sas.py +44 -0
  874. wandb/vendor/pygments/styles/stata.py +40 -0
  875. wandb/vendor/pygments/styles/tango.py +141 -0
  876. wandb/vendor/pygments/styles/trac.py +63 -0
  877. wandb/vendor/pygments/styles/vim.py +63 -0
  878. wandb/vendor/pygments/styles/vs.py +38 -0
  879. wandb/vendor/pygments/styles/xcode.py +51 -0
  880. wandb/vendor/pygments/token.py +213 -0
  881. wandb/vendor/pygments/unistring.py +217 -0
  882. wandb/vendor/pygments/util.py +388 -0
  883. wandb/vendor/watchdog_0_9_0/wandb_watchdog/__init__.py +17 -0
  884. wandb/vendor/watchdog_0_9_0/wandb_watchdog/events.py +615 -0
  885. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/__init__.py +98 -0
  886. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/api.py +369 -0
  887. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/fsevents.py +172 -0
  888. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/fsevents2.py +239 -0
  889. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify.py +218 -0
  890. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify_buffer.py +81 -0
  891. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify_c.py +575 -0
  892. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/kqueue.py +730 -0
  893. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/polling.py +145 -0
  894. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/read_directory_changes.py +133 -0
  895. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/winapi.py +348 -0
  896. wandb/vendor/watchdog_0_9_0/wandb_watchdog/patterns.py +265 -0
  897. wandb/vendor/watchdog_0_9_0/wandb_watchdog/tricks/__init__.py +174 -0
  898. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/__init__.py +151 -0
  899. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/bricks.py +249 -0
  900. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/compat.py +29 -0
  901. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/decorators.py +198 -0
  902. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/delayed_queue.py +88 -0
  903. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/dirsnapshot.py +293 -0
  904. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/echo.py +157 -0
  905. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/event_backport.py +41 -0
  906. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/importlib2.py +40 -0
  907. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/platform.py +57 -0
  908. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/unicode_paths.py +64 -0
  909. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/win32stat.py +123 -0
  910. wandb/vendor/watchdog_0_9_0/wandb_watchdog/version.py +28 -0
  911. wandb/vendor/watchdog_0_9_0/wandb_watchdog/watchmedo.py +577 -0
  912. wandb/wandb_agent.py +611 -0
  913. wandb/wandb_controller.py +719 -0
  914. wandb/wandb_run.py +8 -0
  915. wandb-0.22.1.dist-info/METADATA +223 -0
  916. wandb-0.22.1.dist-info/RECORD +919 -0
  917. wandb-0.22.1.dist-info/WHEEL +4 -0
  918. wandb-0.22.1.dist-info/entry_points.txt +3 -0
  919. wandb-0.22.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1290 @@
1
+ """Implementation of KubernetesRunner class for wandb launch."""
2
+
3
+ import asyncio
4
+ import base64
5
+ import datetime
6
+ import json
7
+ import logging
8
+ import os
9
+ import time
10
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
11
+
12
+ import yaml
13
+
14
+ import wandb
15
+ from wandb.apis.internal import Api
16
+ from wandb.sdk.launch.agent.agent import LaunchAgent
17
+ from wandb.sdk.launch.environment.abstract import AbstractEnvironment
18
+ from wandb.sdk.launch.registry.abstract import AbstractRegistry
19
+ from wandb.sdk.launch.registry.azure_container_registry import AzureContainerRegistry
20
+ from wandb.sdk.launch.registry.local_registry import LocalRegistry
21
+ from wandb.sdk.launch.runner.abstract import Status
22
+ from wandb.sdk.launch.runner.kubernetes_monitor import (
23
+ WANDB_K8S_LABEL_AGENT,
24
+ WANDB_K8S_LABEL_AUXILIARY_RESOURCE,
25
+ WANDB_K8S_LABEL_MONITOR,
26
+ WANDB_K8S_RUN_ID,
27
+ CustomResource,
28
+ LaunchKubernetesMonitor,
29
+ )
30
+ from wandb.sdk.launch.utils import (
31
+ recursive_macro_sub,
32
+ sanitize_identifiers_for_k8s,
33
+ yield_containers,
34
+ )
35
+ from wandb.sdk.lib.retry import ExponentialBackoff, retry_async
36
+ from wandb.util import get_module
37
+
38
+ from .._project_spec import EntryPoint, LaunchProject
39
+ from ..errors import LaunchError
40
+ from ..utils import (
41
+ CODE_MOUNT_DIR,
42
+ LOG_PREFIX,
43
+ MAX_ENV_LENGTHS,
44
+ PROJECT_SYNCHRONOUS,
45
+ get_kube_context_and_api_client,
46
+ make_k8s_label_safe,
47
+ make_name_dns_safe,
48
+ )
49
+ from .abstract import AbstractRun, AbstractRunner
50
+
51
+ get_module(
52
+ "kubernetes_asyncio",
53
+ required="Kubernetes runner requires the kubernetes package. Please install it with `pip install wandb[launch]`.",
54
+ )
55
+
56
+ import kubernetes_asyncio # type: ignore # noqa: E402
57
+ from kubernetes_asyncio import client # noqa: E402
58
+ from kubernetes_asyncio.client.api.apps_v1_api import ( # type: ignore # noqa: E402
59
+ AppsV1Api,
60
+ )
61
+ from kubernetes_asyncio.client.api.batch_v1_api import ( # type: ignore # noqa: E402
62
+ BatchV1Api,
63
+ )
64
+ from kubernetes_asyncio.client.api.core_v1_api import ( # type: ignore # noqa: E402
65
+ CoreV1Api,
66
+ )
67
+ from kubernetes_asyncio.client.api.custom_objects_api import ( # type: ignore # noqa: E402
68
+ CustomObjectsApi,
69
+ )
70
+ from kubernetes_asyncio.client.api.networking_v1_api import ( # type: ignore # noqa: E402
71
+ NetworkingV1Api,
72
+ )
73
+ from kubernetes_asyncio.client.models.v1_secret import ( # type: ignore # noqa: E402
74
+ V1Secret,
75
+ )
76
+ from kubernetes_asyncio.client.rest import ApiException # type: ignore # noqa: E402
77
+
78
+ TIMEOUT = 5
79
+ API_KEY_SECRET_MAX_RETRIES = 5
80
+
81
+ _logger = logging.getLogger(__name__)
82
+
83
+
84
+ SOURCE_CODE_PVC_MOUNT_PATH = os.environ.get("WANDB_LAUNCH_CODE_PVC_MOUNT_PATH")
85
+ SOURCE_CODE_PVC_NAME = os.environ.get("WANDB_LAUNCH_CODE_PVC_NAME")
86
+
87
+
88
class KubernetesSubmittedRun(AbstractRun):
    """Wrapper for a launched run on Kubernetes."""

    # States after which the job makes no further progress. Shared by `wait`
    # and `get_status` so the two can never disagree about what "done" means
    # (previously `wait` ignored "stopped" and would poll forever).
    _TERMINAL_STATES = ("stopped", "failed", "finished", "preempted")

    def __init__(
        self,
        batch_api: "BatchV1Api",
        core_api: "CoreV1Api",
        apps_api: "AppsV1Api",
        network_api: "NetworkingV1Api",
        name: str,
        namespace: Optional[str] = "default",
        secret: Optional["V1Secret"] = None,
        auxiliary_resource_label_key: Optional[str] = None,
    ) -> None:
        """Initialize a KubernetesSubmittedRun.

        This object does not query the cluster for job state itself:
        `get_status` returns whatever `LaunchKubernetesMonitor` has recorded
        for the job name.

        Arguments:
            batch_api: Kubernetes BatchV1Api object.
            core_api: Kubernetes CoreV1Api object.
            apps_api: Kubernetes AppsV1Api object.
            network_api: Kubernetes NetworkingV1Api object.
            name: Name of the job.
            namespace: Kubernetes namespace.
            secret: Kubernetes secret to delete once the run is over.
            auxiliary_resource_label_key: Label value used to find and delete
                auxiliary resources once the run is over. When None, no
                auxiliary cleanup is attempted.

        Returns:
            None.
        """
        self.batch_api = batch_api
        self.core_api = core_api
        self.apps_api = apps_api
        self.network_api = network_api
        self.name = name
        self.namespace = namespace
        self._fail_count = 0
        self.secret = secret
        self.auxiliary_resource_label_key = auxiliary_resource_label_key

    @property
    def id(self) -> str:
        """Return the run id."""
        return self.name

    async def get_logs(self) -> Optional[str]:
        """Return the logs of the first pod found for this job.

        Returns:
            The pod log as a string, or None when no pod exists, the log is
            empty, or any error occurs while fetching it (best effort).
        """
        try:
            pods = await self.core_api.list_namespaced_pod(
                label_selector=f"job-name={self.name}", namespace=self.namespace
            )
            pod_names = [pi.metadata.name for pi in pods.items]
            if not pod_names:
                wandb.termwarn(f"Found no pods for kubernetes job: {self.name}")
                return None
            logs = await self.core_api.read_namespaced_pod_log(
                name=pod_names[0], namespace=self.namespace
            )
            if logs:
                return str(logs)
            else:
                wandb.termwarn(f"No logs for kubernetes pod(s): {pod_names}")
                return None
        except Exception as e:
            # Log retrieval is best effort; report the problem and move on.
            wandb.termerror(f"{LOG_PREFIX}Failed to get pod logs: {e}")
            return None

    async def wait(self) -> bool:
        """Wait for the run to reach a terminal state.

        Polls `get_status` every 5 seconds until the state is one of
        `_TERMINAL_STATES`, then cleans up the secret and auxiliary resources.

        Returns:
            True if the run finished successfully, False otherwise.
        """
        while True:
            status = await self.get_status()
            wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {status.state}")
            if status.state in self._TERMINAL_STATES:
                break
            await asyncio.sleep(5)

        await self._delete_secret()
        await self._delete_auxiliary_resources_by_label()
        # todo: not sure if this (copied from aws runner) is the right
        # approach? should we return false on failure
        return status.state == "finished"

    async def get_status(self) -> Status:
        """Return the monitor's status for this job, cleaning up when done.

        When the state is terminal, the secret and any labelled auxiliary
        resources are deleted before the status is returned.
        """
        status = LaunchKubernetesMonitor.get_status(self.name)
        if status.state in self._TERMINAL_STATES:
            await self._delete_secret()
            await self._delete_auxiliary_resources_by_label()
        return status

    async def cancel(self) -> None:
        """Cancel the run.

        Deletes the Kubernetes Job, then the secret and auxiliary resources.

        Raises:
            LaunchError: If the Kubernetes API rejects a deletion.
        """
        try:
            await self.batch_api.delete_namespaced_job(
                namespace=self.namespace,
                name=self.name,
            )
            await self._delete_secret()
            await self._delete_auxiliary_resources_by_label()
        except ApiException as e:
            raise LaunchError(
                f"Failed to delete Kubernetes Job {self.name} in namespace {self.namespace}: {str(e)}"
            ) from e

    async def _delete_secret(self) -> None:
        """Delete the run's secret, unless WANDB_RELEASE_NAME is set."""
        # Cleanup secret if not running in a helm-managed context
        if not os.environ.get("WANDB_RELEASE_NAME") and self.secret:
            await self.core_api.delete_namespaced_secret(
                name=self.secret.metadata.name,
                namespace=self.secret.metadata.namespace,
            )
            # Forget the secret so repeated cleanup calls become no-ops.
            self.secret = None

    async def _delete_auxiliary_resources_by_label(self) -> None:
        """Delete every auxiliary resource carrying this run's label.

        Failures are reported as warnings and never raised, so a cleanup
        problem cannot mask the run's real outcome.
        """
        if self.auxiliary_resource_label_key is None:
            return

        label_selector = (
            f"{WANDB_K8S_LABEL_AUXILIARY_RESOURCE}={self.auxiliary_resource_label_key}"
        )

        try:
            # (api client, resource suffix) pairs; the suffix is spliced into
            # the client's list_namespaced_* / delete_namespaced_* method names.
            resource_cleanups = [
                (self.core_api, "service"),
                (self.batch_api, "job"),
                (self.core_api, "pod"),
                (self.core_api, "secret"),
                (self.apps_api, "deployment"),
                (self.network_api, "network_policy"),
            ]

            for api_client, resource_type in resource_cleanups:
                try:
                    list_method = getattr(
                        api_client, f"list_namespaced_{resource_type}"
                    )
                    delete_method = getattr(
                        api_client, f"delete_namespaced_{resource_type}"
                    )

                    # List resources with our label
                    resources = await list_method(
                        namespace=self.namespace, label_selector=label_selector
                    )

                    # Delete each resource
                    for resource in resources.items:
                        await delete_method(
                            name=resource.metadata.name, namespace=self.namespace
                        )

                except (AttributeError, ApiException) as e:
                    # One resource type failing must not stop the others.
                    wandb.termwarn(f"Could not clean up {resource_type}: {e}")

        except Exception as e:
            wandb.termwarn(f"Failed to clean up some auxiliary resources: {e}")
251
+
252
+
253
+ class CrdSubmittedRun(AbstractRun):
254
+ """Run submitted to a CRD backend, e.g. Volcano."""
255
+
256
+ def __init__(
257
+ self,
258
+ group: str,
259
+ version: str,
260
+ plural: str,
261
+ name: str,
262
+ namespace: str,
263
+ core_api: CoreV1Api,
264
+ custom_api: CustomObjectsApi,
265
+ ) -> None:
266
+ """Create a run object for tracking the progress of a CRD.
267
+
268
+ Arguments:
269
+ group: The API group of the CRD.
270
+ version: The API version of the CRD.
271
+ plural: The plural name of the CRD.
272
+ name: The name of the CRD instance.
273
+ namespace: The namespace of the CRD instance.
274
+ core_api: The Kubernetes core API client.
275
+ custom_api: The Kubernetes custom object API client.
276
+
277
+ Raises:
278
+ LaunchError: If the CRD instance does not exist.
279
+ """
280
+ self.group = group
281
+ self.version = version
282
+ self.plural = plural
283
+ self.name = name
284
+ self.namespace = namespace
285
+ self.core_api = core_api
286
+ self.custom_api = custom_api
287
+ self._fail_count = 0
288
+
289
+ @property
290
+ def id(self) -> str:
291
+ """Get the name of the custom object."""
292
+ return self.name
293
+
294
+ async def get_logs(self) -> Optional[str]:
295
+ """Get logs for custom object."""
296
+ # TODO: test more carefully once we release multi-node support
297
+ logs: Dict[str, Optional[str]] = {}
298
+ try:
299
+ pods = await self.core_api.list_namespaced_pod(
300
+ label_selector=f"wandb/run-id={self.name}", namespace=self.namespace
301
+ )
302
+ pod_names = [pi.metadata.name for pi in pods.items]
303
+ for pod_name in pod_names:
304
+ logs[pod_name] = await self.core_api.read_namespaced_pod_log(
305
+ name=pod_name, namespace=self.namespace
306
+ )
307
+ except ApiException as e:
308
+ wandb.termwarn(f"Failed to get logs for {self.name}: {str(e)}")
309
+ return None
310
+ if not logs:
311
+ return None
312
+ logs_as_array = [f"Pod {pod_name}:\n{log}" for pod_name, log in logs.items()]
313
+ return "\n".join(logs_as_array)
314
+
315
+ async def get_status(self) -> Status:
316
+ """Get status of custom object."""
317
+ return LaunchKubernetesMonitor.get_status(self.name)
318
+
319
async def cancel(self) -> None:
    """Delete the custom object backing this run.

    Raises:
        LaunchError: If the Kubernetes API raises an ApiException on delete.
    """
    target = {
        "group": self.group,
        "version": self.version,
        "namespace": self.namespace,
        "plural": self.plural,
        "name": self.name,
    }
    try:
        await self.custom_api.delete_namespaced_custom_object(**target)
    except ApiException as e:
        raise LaunchError(
            f"Failed to delete CRD {self.name} in namespace {self.namespace}: {str(e)}"
        ) from e
333
+
334
async def wait(self) -> bool:
    """Poll the status every 5 seconds until the object reaches a terminal state.

    Returns:
        True if the terminal state is "finished", False for "failed" or
        "preempted".
    """
    terminal_states = ["finished", "failed", "preempted"]
    while True:
        current = await self.get_status()
        wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {current}")
        if current.state not in terminal_states:
            await asyncio.sleep(5)
            continue
        return current.state == "finished"
342
+
343
+
344
+ class KubernetesRunner(AbstractRunner):
345
+ """Launches runs onto kubernetes."""
346
+
347
def __init__(
    self,
    api: Api,
    backend_config: Dict[str, Any],
    environment: AbstractEnvironment,
    registry: AbstractRegistry,
) -> None:
    """Set up a runner that launches runs onto Kubernetes.

    Arguments:
        api: The API client object, forwarded to the base runner.
        backend_config: The backend configuration, forwarded to the base runner.
        environment: The environment to launch runs into.
        registry: The registry, later used when creating image pull secrets.

    Raises:
        LaunchError: If the Kubernetes configuration is invalid.
    """
    super().__init__(api, backend_config)
    self.registry = registry
    self.environment = environment
367
+
368
def get_namespace(
    self, resource_args: Dict[str, Any], context: Dict[str, Any]
) -> str:
    """Pick the namespace to launch into.

    Precedence, first truthy value wins:
    ``resource_args["metadata"]["namespace"]``, the legacy top-level
    ``resource_args["namespace"]``, the ``runner.namespace`` backend config
    entry, then the namespace of the kube context (or "default").

    Arguments:
        resource_args: The resource args to launch.
        context: The k8s config context; may be falsy.

    Returns:
        The namespace to launch into.
    """
    # Resolved up front, exactly like the other lookups depend on nothing else.
    if context:
        fallback = context["context"].get("namespace", "default")
    else:
        fallback = "default"

    from_metadata = resource_args.get("metadata", {}).get("namespace")
    if from_metadata:
        return from_metadata  # type: ignore[no-any-return]

    # continue support for malformed namespace
    legacy = resource_args.get("namespace")
    if legacy:
        return legacy  # type: ignore[no-any-return]

    from_backend = self.backend_config.get("runner", {}).get("namespace")
    if from_backend:
        return from_backend  # type: ignore[no-any-return]

    return fallback  # type: ignore[no-any-return]
391
+
392
async def _inject_defaults(
    self,
    resource_args: Dict[str, Any],
    launch_project: LaunchProject,
    image_uri: str,
    namespace: str,
    core_api: "CoreV1Api",
) -> Tuple[Dict[str, Any], Optional["V1Secret"]]:
    """Apply our default values, return job dict and api key secret.

    Starting from a ``batch/v1`` Job skeleton overlaid with ``resource_args``,
    this fills in job/pod defaults, labels, container names and security
    contexts, the image, entrypoint/args, and the launch environment
    variables. When an agent is initialized or the run is synchronous, a
    non-empty ``WANDB_API_KEY`` is delivered through a Kubernetes secret
    reference instead of a literal value.

    Arguments:
        resource_args (Dict[str, Any]): The resource args to launch.
        launch_project (LaunchProject): The launch project.
        image_uri (str): The image to run in the first container.
        namespace (str): The namespace.
        core_api (CoreV1Api): The core api.

    Returns:
        Tuple[Dict[str, Any], Optional["V1Secret"]]: The resource args and api key secret.
    """
    job: Dict[str, Any] = {
        "apiVersion": "batch/v1",
        "kind": "Job",
    }
    job.update(resource_args)

    job_metadata: Dict[str, Any] = job.get("metadata", {})
    job_spec: Dict[str, Any] = {"backoffLimit": 0, "ttlSecondsAfterFinished": 60}
    job_spec.update(job.get("spec", {}))
    pod_template: Dict[str, Any] = job_spec.get("template", {})
    pod_spec: Dict[str, Any] = {"restartPolicy": "Never"}
    pod_spec.update(pod_template.get("spec", {}))
    containers: List[Dict[str, Any]] = pod_spec.get("containers", [{}])

    # Add labels to job metadata
    job_metadata.setdefault("labels", {})
    job_metadata["labels"][WANDB_K8S_RUN_ID] = launch_project.run_id
    job_metadata["labels"][WANDB_K8S_LABEL_MONITOR] = "true"
    if LaunchAgent.initialized():
        job_metadata["labels"][WANDB_K8S_LABEL_AGENT] = LaunchAgent.name()
    # name precedence: name in spec > generated name
    if not job_metadata.get("name"):
        job_metadata["generateName"] = make_name_dns_safe(
            f"launch-{launch_project.target_entity}-{launch_project.target_project}-"
        )
    job_metadata["namespace"] = namespace

    for i, cont in enumerate(containers):
        if "name" not in cont:
            cont["name"] = "launch" + str(i)
        if "securityContext" not in cont:
            cont["securityContext"] = {
                "allowPrivilegeEscalation": False,
                "capabilities": {"drop": ["ALL"]},
                "seccompProfile": {"type": "RuntimeDefault"},
            }

    entry_point = (
        launch_project.override_entrypoint or launch_project.get_job_entry_point()
    )
    if launch_project.docker_image:
        # dont specify run id if user provided image, could have multiple runs
        containers[0]["image"] = image_uri
        # TODO: handle secret pulling image from registry
    elif not any("image" in cont for cont in containers):
        assert entry_point is not None
        # in the non instance case we need to make an imagePullSecret
        # so the new job can pull the image
        containers[0]["image"] = image_uri
        secret = await maybe_create_imagepull_secret(
            core_api, self.registry, launch_project.run_id, namespace
        )
        if secret is not None:
            pod_spec["imagePullSecrets"] = [
                {"name": f"regcred-{launch_project.run_id}"}
            ]

    inject_entrypoint_and_args(
        containers,
        entry_point,
        launch_project.override_args,
        launch_project.override_entrypoint is not None,
    )

    env_vars = launch_project.get_env_vars_dict(
        self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
    )

    # The secret name does not depend on the container, so derive it once.
    release_name = os.environ.get("WANDB_RELEASE_NAME")
    secret_name = "wandb-api-key"
    if release_name:
        secret_name += f"-{release_name}"
    else:
        secret_name += f"-{launch_project.run_id}"

    def handle_exception(e):
        wandb.termwarn(
            f"Exception when ensuring Kubernetes API key secret: {e}. Retrying..."
        )

    api_key_secret = None
    for cont in containers:
        # Add our env vars to user supplied env vars
        env = cont.get("env") or []
        for key, value in env_vars.items():
            if (
                key == "WANDB_API_KEY"
                and value
                and (
                    LaunchAgent.initialized()
                    or self.backend_config[PROJECT_SYNCHRONOUS]
                )
            ):
                # Override API key with secret. TODO: Do the same for other runners
                # Every container references the same secret, so the
                # (retried) API round trip is only made for the first one.
                if api_key_secret is None:
                    api_key_secret = await retry_async(
                        backoff=ExponentialBackoff(
                            initial_sleep=datetime.timedelta(seconds=1),
                            max_sleep=datetime.timedelta(minutes=1),
                            max_retries=API_KEY_SECRET_MAX_RETRIES,
                        ),
                        fn=ensure_api_key_secret,
                        on_exc=handle_exception,
                        core_api=core_api,
                        secret_name=secret_name,
                        namespace=namespace,
                        api_key=value,
                    )
                env.append(
                    {
                        "name": key,
                        "valueFrom": {
                            "secretKeyRef": {
                                "name": secret_name,
                                "key": "password",
                            }
                        },
                    }
                )
            else:
                env.append({"name": key, "value": value})
        cont["env"] = env

    pod_spec["containers"] = containers
    pod_template["spec"] = pod_spec
    job_spec["template"] = pod_template
    job["spec"] = job_spec
    job["metadata"] = job_metadata

    add_label_to_pods(
        job,
        WANDB_K8S_LABEL_MONITOR,
        "true",
    )

    if launch_project.job_base_image:
        apply_code_mount_configuration(
            job,
            launch_project,
        )

    # Add wandb.ai/agent: current agent label on all pods
    if LaunchAgent.initialized():
        add_label_to_pods(
            job,
            WANDB_K8S_LABEL_AGENT,
            LaunchAgent.name(),
        )

    return job, api_key_secret
560
+
561
async def _wait_for_resource_ready(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    config: Dict[str, Any],
    namespace: str,
    timeout_seconds: int = 300,
) -> None:
    """Block until the resource described by ``config`` reports ready.

    Deployments, Services and Pods get a dedicated readiness check; any
    other kind just waits 5 seconds. A config without a kind or a metadata
    name is logged as an error and skipped.

    Arguments:
        api_client: The Kubernetes API client.
        config: The resource configuration.
        namespace: The namespace where the resource was created.
        timeout_seconds: Maximum time to wait for readiness.
    """
    resource_kind = config.get("kind")
    resource_name = config.get("metadata", {}).get("name")

    if not (resource_kind and resource_name):
        wandb.termerror(
            f"{LOG_PREFIX}Cannot wait for resource without kind or name"
        )
        return

    wandb.termlog(
        f"{LOG_PREFIX}Waiting for {resource_kind} '{resource_name}' to be ready..."
    )

    start_time = time.time()

    # Select the kind-specific readiness coroutine, if there is one.
    if resource_kind == "Deployment":
        waiter = self._wait_for_deployment_ready
    elif resource_kind == "Service":
        waiter = self._wait_for_service_ready
    elif resource_kind == "Pod":
        waiter = self._wait_for_pod_ready
    else:
        waiter = None

    if waiter is None:
        wandb.termlog(
            f"{LOG_PREFIX}No specific readiness check for {resource_kind}, waiting 5 seconds..."
        )
        await asyncio.sleep(5)
    else:
        await waiter(api_client, resource_name, namespace, timeout_seconds)

    elapsed = time.time() - start_time
    wandb.termlog(
        f"{LOG_PREFIX}{resource_kind} '{resource_name}' is ready after {elapsed:.1f}s"
    )
613
+
614
async def _wait_for_deployment_ready(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    name: str,
    namespace: str,
    timeout_seconds: int,
) -> None:
    """Wait until the Deployment's ready replicas reach its replica count."""
    apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)

    async def check_deployment_ready():
        deployment = await apps_api.read_namespaced_deployment(
            name=name, namespace=namespace
        )
        status = deployment.status

        # Either count being unset or zero means "not ready yet".
        ready = status.ready_replicas
        if not ready:
            return False
        desired = status.replicas
        if not desired:
            return False
        return ready >= desired

    await self._wait_with_timeout(check_deployment_ready, timeout_seconds, name)
636
+
637
async def _wait_for_service_ready(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    name: str,
    namespace: str,
    timeout_seconds: int,
) -> None:
    """Wait until the Service's endpoints list at least one address."""
    core_api = kubernetes_asyncio.client.CoreV1Api(api_client)

    async def check_service_ready():
        endpoints = await core_api.read_namespaced_endpoints(
            name=name, namespace=namespace
        )
        subsets = endpoints.subsets or []
        # These are ready pod addresses
        return any(subset.addresses for subset in subsets)

    await self._wait_with_timeout(check_service_ready, timeout_seconds, name)
658
+
659
async def _wait_for_pod_ready(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    name: str,
    namespace: str,
    timeout_seconds: int,
) -> None:
    """Wait until the Pod is Running and all reported containers are ready."""
    core_api = kubernetes_asyncio.client.CoreV1Api(api_client)

    async def check_pod_ready():
        pod = await core_api.read_namespaced_pod(name=name, namespace=namespace)
        if pod.status.phase != "Running":
            return False
        container_statuses = pod.status.container_statuses
        if not container_statuses:
            # Running with no container statuses reported counts as ready.
            return True
        return all(entry.ready for entry in container_statuses)

    await self._wait_with_timeout(check_pod_ready, timeout_seconds, name)
678
+
679
+ async def _wait_with_timeout(
680
+ self, check_func, timeout_seconds: int, name: str
681
+ ) -> None:
682
+ """Generic timeout wrapper for readiness checks."""
683
+ start_time = time.time()
684
+
685
+ while time.time() - start_time < timeout_seconds:
686
+ try:
687
+ if await check_func():
688
+ return
689
+ except kubernetes_asyncio.client.ApiException as e:
690
+ if e.status == 404:
691
+ pass
692
+ else:
693
+ wandb.termerror(
694
+ f"{LOG_PREFIX}Error waiting for resource '{name}': {e}"
695
+ )
696
+ raise
697
+ except Exception as e:
698
+ wandb.termerror(f"{LOG_PREFIX}Error waiting for resource '{name}': {e}")
699
+ raise
700
+ await asyncio.sleep(2)
701
+
702
+ raise LaunchError(
703
+ f"Resource '{name}' not ready within {timeout_seconds} seconds"
704
+ )
705
+
706
async def _prepare_resource(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    config: Dict[str, Any],
    namespace: str,
    run_id: str,
    launch_project: LaunchProject,
    api_key_secret: Optional["V1Secret"] = None,
    wait_for_ready: bool = True,
    wait_timeout: int = 300,
    auxiliary_resource_label_value: Optional[str] = None,
) -> None:
    """Label, configure and create an additional resource for a launch.

    ``config`` is modified in place: run/creator labels are added, the
    ``WANDB_CONFIG`` environment variable is injected into its containers,
    and, when given, the API key secret reference and auxiliary label.

    Arguments:
        api_client: The Kubernetes API client.
        config: The resource configuration to prepare.
        namespace: The namespace to create the resource in.
        run_id: The run ID to label the resource with.
        launch_project: The launch project to get environment variables from.
        api_key_secret: The API key secret to inject.
        wait_for_ready: Whether to wait for the resource to be ready after creation.
        wait_timeout: Maximum time in seconds to wait for resource readiness.
        auxiliary_resource_label_value: Optional label value added to the
            resource and to its pods.

    Raises:
        LaunchError: If creating the resource or waiting for it fails.
    """
    config.setdefault("metadata", {})
    config["metadata"].setdefault("labels", {})
    config["metadata"]["labels"][WANDB_K8S_RUN_ID] = run_id
    config["metadata"]["labels"]["wandb.ai/created-by"] = "launch-agent"
    if auxiliary_resource_label_value:
        config["metadata"]["labels"][WANDB_K8S_LABEL_AUXILIARY_RESOURCE] = (
            auxiliary_resource_label_value
        )

    env_vars = launch_project.get_env_vars_dict(
        self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
    )
    wandb_config_env = {
        "WANDB_CONFIG": env_vars.get("WANDB_CONFIG", "{}"),
    }
    add_wandb_env(config, wandb_config_env)

    if auxiliary_resource_label_value:
        add_label_to_pods(
            config,
            WANDB_K8S_LABEL_AUXILIARY_RESOURCE,
            auxiliary_resource_label_value,
        )

    if api_key_secret:
        for cont in yield_containers(config):
            # setdefault already stores the list on the container.
            cont.setdefault("env", []).append(
                {
                    "name": "WANDB_API_KEY",
                    "valueFrom": {
                        "secretKeyRef": {
                            "name": api_key_secret.metadata.name,
                            "key": "password",
                        }
                    },
                }
            )

    try:
        sanitize_identifiers_for_k8s(config)

        await kubernetes_asyncio.utils.create_from_dict(
            api_client, config, namespace=namespace
        )

        if wait_for_ready:
            await self._wait_for_resource_ready(
                api_client, config, namespace, wait_timeout
            )
    except Exception as e:
        wandb.termerror(f"{LOG_PREFIX}Failed to create Kubernetes resource: {e}")
        # Keep the original exception as the explicit cause for debugging.
        raise LaunchError(f"Failed to create Kubernetes resource: {e}") from e
784
+
785
async def run(
    self, launch_project: LaunchProject, image_uri: str
) -> Optional[AbstractRun]:
    """Execute a launch project on Kubernetes.

    Resource args with an ``apiVersion`` other than ``batch/v1`` or
    ``batch/v1beta1`` are submitted as a namespaced custom object;
    everything else becomes a Job built by ``_inject_defaults``, preceded by
    any ``additional_services`` from the launch spec.

    Arguments:
        launch_project: The launch project to execute.
        image_uri: The image to run; also substituted into resource-arg macros.

    Returns:
        The submitted run object.

    Raises:
        LaunchError: If the source-code PVC settings are missing for a base
            image job, or if creating the custom object or the Job fails.
    """
    await LaunchKubernetesMonitor.ensure_initialized()
    resource_args = launch_project.fill_macros(image_uri).get("kubernetes", {})
    if not resource_args:
        wandb.termlog(
            f"{LOG_PREFIX}Note: no resource args specified. Add a "
            "Kubernetes yaml spec or other options in a json file "
            "with --resource-args <json>."
        )
    _logger.info(f"Running Kubernetes job with resource args: {resource_args}")

    context, api_client = await get_kube_context_and_api_client(
        kubernetes_asyncio, resource_args
    )

    # If using pvc for code mount, move code there.
    if launch_project.job_base_image is not None:
        if SOURCE_CODE_PVC_NAME is None or SOURCE_CODE_PVC_MOUNT_PATH is None:
            raise LaunchError(
                "WANDB_LAUNCH_SOURCE_CODE_PVC_ environment variables not set. "
                "Unable to mount source code PVC into base image. "
                "Use the `codeMountPvcName` variable in the agent helm chart "
                "to enable base image jobs for this agent. See "
                "https://github.com/wandb/helm-charts/tree/main/charts/launch-agent "
                "for more information."
            )
        code_subdir = launch_project.get_image_source_string()
        launch_project.change_project_dir(
            os.path.join(SOURCE_CODE_PVC_MOUNT_PATH, code_subdir)
        )

    # If the user specified an alternate api, we will execute this
    # run by creating a custom object.
    api_version = resource_args.get("apiVersion", "batch/v1")

    if api_version not in ["batch/v1", "batch/v1beta1"]:
        env_vars = launch_project.get_env_vars_dict(
            self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
        )
        # Crawl the resource args and add our env vars to the containers.
        add_wandb_env(resource_args, env_vars)

        # Add our labels to the resource args. This is necessary for the
        # agent to find the custom object later on.
        resource_args["metadata"] = resource_args.get("metadata", {})
        resource_args["metadata"]["labels"] = resource_args["metadata"].get(
            "labels", {}
        )
        resource_args["metadata"]["labels"][WANDB_K8S_LABEL_MONITOR] = "true"

        # Crawl the resource args and add our labels to the pods. This is
        # necessary for the agent to find the pods later on.
        add_label_to_pods(
            resource_args,
            WANDB_K8S_LABEL_MONITOR,
            "true",
        )

        # Add wandb.ai/agent: current agent label on all pods
        if LaunchAgent.initialized():
            add_label_to_pods(
                resource_args,
                WANDB_K8S_LABEL_AGENT,
                LaunchAgent.name(),
            )
            resource_args["metadata"]["labels"][WANDB_K8S_LABEL_AGENT] = (
                LaunchAgent.name()
            )

        if launch_project.job_base_image:
            apply_code_mount_configuration(resource_args, launch_project)

        overrides = {}
        if launch_project.override_args:
            overrides["args"] = launch_project.override_args
        if launch_project.override_entrypoint:
            overrides["command"] = launch_project.override_entrypoint.command
        add_entrypoint_args_overrides(
            resource_args,
            overrides,
        )
        api = client.CustomObjectsApi(api_client)
        # Infer the attributes of a custom object from the apiVersion and/or
        # a kind: attribute in the resource args.
        namespace = self.get_namespace(resource_args, context)
        group, version, *_ = api_version.split("/")
        group = resource_args.get("group", group)
        version = resource_args.get("version", version)
        kind = resource_args.get("kind", version)
        plural = f"{kind.lower()}s"
        custom_resource = CustomResource(
            group=group,
            version=version,
            plural=plural,
        )
        LaunchKubernetesMonitor.monitor_namespace(
            namespace, custom_resource=custom_resource
        )

        try:
            response = await api.create_namespaced_custom_object(
                group=group,
                version=version,
                namespace=namespace,
                plural=plural,
                body=resource_args,
            )
        except ApiException as e:
            body = json.loads(e.body)
            body_yaml = yaml.dump(body)
            raise LaunchError(
                f"Error creating CRD of kind {kind}: {e.status} {e.reason}\n{body_yaml}"
            ) from e
        name = response.get("metadata", {}).get("name")
        # Log the defensively-read name instead of indexing the response again.
        _logger.info(f"Created {kind} {name}")
        submitted_run = CrdSubmittedRun(
            name=name,
            group=group,
            version=version,
            namespace=namespace,
            plural=plural,
            core_api=client.CoreV1Api(api_client),
            custom_api=api,
        )
        if self.backend_config[PROJECT_SYNCHRONOUS]:
            await submitted_run.wait()
        return submitted_run

    batch_api = kubernetes_asyncio.client.BatchV1Api(api_client)
    core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
    apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)
    network_api = kubernetes_asyncio.client.NetworkingV1Api(api_client)

    namespace = self.get_namespace(resource_args, context)
    job, secret = await self._inject_defaults(
        resource_args, launch_project, image_uri, namespace, core_api
    )

    update_dict = {
        "project_name": launch_project.target_project,
        "entity_name": launch_project.target_entity,
        "run_id": launch_project.run_id,
        "run_name": launch_project.name,
        "image_uri": image_uri,
        "author": launch_project.author,
    }
    update_dict.update(os.environ)
    additional_services: List[Dict[str, Any]] = recursive_macro_sub(
        launch_project.launch_spec.get("additional_services", []), update_dict
    )
    auxiliary_resource_label_value = make_k8s_label_safe(
        f"aux-{launch_project.target_entity}-{launch_project.target_project}-{launch_project.run_id}"
    )
    if additional_services:
        wandb.termlog(
            f"{LOG_PREFIX}Creating additional services: {additional_services}"
        )

        wait_for_ready = resource_args.get("wait_for_ready", True)
        wait_timeout = resource_args.get("wait_timeout", 300)

        # Entries without a (non-empty) "config" are skipped.
        await asyncio.gather(
            *[
                self._prepare_resource(
                    api_client,
                    resource.get("config", {}),
                    namespace,
                    launch_project.run_id,
                    launch_project,
                    secret,
                    wait_for_ready,
                    wait_timeout,
                    auxiliary_resource_label_value,
                )
                for resource in additional_services
                if resource.get("config", {})
            ]
        )

    msg = "Creating Kubernetes job"
    if "name" in resource_args:
        msg += f": {resource_args['name']}"
    _logger.info(msg)
    try:
        response = await kubernetes_asyncio.utils.create_from_dict(
            api_client, job, namespace=namespace
        )
    except kubernetes_asyncio.utils.FailToCreateError as e:
        # Report the first underlying API error, as before.
        for exc in e.api_exceptions:
            resp = json.loads(exc.body)
            msg = resp.get("message")
            code = resp.get("code")
            raise LaunchError(
                f"Failed to create Kubernetes job for run {launch_project.run_id} ({code} {exc.reason}): {msg}"
            ) from e
        # No per-object errors attached: still fail here rather than fall
        # through to an unbound `response` below.
        raise LaunchError(
            f"Failed to create Kubernetes job for run {launch_project.run_id}: {str(e)}"
        ) from e
    except Exception as e:
        raise LaunchError(
            f"Unexpected exception when creating Kubernetes job: {str(e)}\n"
        ) from e
    job_response = response[0]
    job_name = job_response.metadata.name
    LaunchKubernetesMonitor.monitor_namespace(namespace)
    submitted_job = KubernetesSubmittedRun(
        batch_api,
        core_api,
        apps_api,
        network_api,
        job_name,
        namespace,
        secret,
        auxiliary_resource_label_value,
    )
    if self.backend_config[PROJECT_SYNCHRONOUS]:
        await submitted_job.wait()

    return submitted_job
1012
+
1013
+
1014
def inject_entrypoint_and_args(
    containers: List[dict],
    entry_point: Optional[EntryPoint],
    override_args: List[str],
    should_override_entrypoint: bool,
) -> None:
    """Write the override args and the entrypoint command into each container.

    Arguments:
        containers: The containers to modify in place.
        entry_point: The entrypoint whose ``command`` is injected; skipped
            when falsy.
        override_args: Args assigned to every container when non-empty.
        should_override_entrypoint: When true, replace a command the
            container already has; otherwise only fill in a missing one.

    Returns:
        None
    """
    for container in containers:
        if override_args:
            container["args"] = override_args
        if not entry_point:
            continue
        if should_override_entrypoint or not container.get("command"):
            container["command"] = entry_point.command
1038
+
1039
+
1040
async def ensure_api_key_secret(
    core_api: "CoreV1Api",
    secret_name: str,
    namespace: str,
    api_key: str,
) -> "V1Secret":
    """Create a secret containing a user's wandb API key.

    If a secret with this name already exists and holds the same data, it is
    returned as is. If its data differs and it carries the
    ``wandb.ai/created-by: launch-agent`` label, it is deleted and recreated.

    Arguments:
        core_api: The Kubernetes CoreV1Api object.
        secret_name: The name to use for the secret.
        namespace: The namespace to create the secret in.
        api_key: The user's wandb API key

    Returns:
        The created (or already matching) secret

    Raises:
        LaunchError: On any failure, including an existing secret with
            different data that was not created by the launch agent.
    """
    secret_data = {"password": base64.b64encode(api_key.encode()).decode()}
    labels = {"wandb.ai/created-by": "launch-agent"}
    secret = client.V1Secret(
        data=secret_data,
        metadata=client.V1ObjectMeta(
            name=secret_name, namespace=namespace, labels=labels
        ),
        kind="Secret",
        type="kubernetes.io/basic-auth",
    )

    try:
        try:
            return await core_api.create_namespaced_secret(namespace, secret)
        except ApiException as e:
            # 409 = conflict = secret already exists
            if e.status == 409:
                existing_secret = await core_api.read_namespaced_secret(
                    name=secret_name, namespace=namespace
                )
                if existing_secret.data != secret_data:
                    # A secret without labels reports `labels` as None;
                    # treat that as "not ours" instead of crashing on .get.
                    existing_labels = existing_secret.metadata.labels or {}
                    # If it's a previous secret made by launch agent, clean it up
                    if existing_labels.get("wandb.ai/created-by") == "launch-agent":
                        await core_api.delete_namespaced_secret(
                            name=secret_name, namespace=namespace
                        )
                        return await core_api.create_namespaced_secret(
                            namespace, secret
                        )
                    else:
                        raise LaunchError(
                            f"Kubernetes secret already exists in namespace {namespace} with incorrect data: {secret_name}"
                        )
                return existing_secret
            raise
    except Exception as e:
        raise LaunchError(
            f"Exception when ensuring Kubernetes API key secret: {str(e)}\n"
        ) from e
1099
+
1100
+
1101
async def maybe_create_imagepull_secret(
    core_api: "CoreV1Api",
    registry: AbstractRegistry,
    run_id: str,
    namespace: str,
) -> Optional["V1Secret"]:
    """Create a ``regcred-<run_id>`` dockerconfigjson secret for image pulls.

    Arguments:
        core_api: The Kubernetes CoreV1Api object.
        registry: The registry to pull from.
        run_id: The run id.
        namespace: The namespace to create the secret in.

    Returns:
        The created secret, the existing one on a 409 conflict, or None for
        LocalRegistry and AzureContainerRegistry, which need no secret.

    Raises:
        LaunchError: If creating (or re-reading) the secret fails.
    """
    if isinstance(registry, (LocalRegistry, AzureContainerRegistry)):
        # Secret not required
        return None

    uname, token = await registry.get_username_password()
    encoded_auth = base64.b64encode(f"{uname}:{token}".encode()).decode()
    creds_info = {
        "auths": {
            registry.uri: {
                "auth": encoded_auth,
                # need an email but the use is deprecated
                "email": "deprecated@wandblaunch.com",
            }
        }
    }
    docker_config = base64.b64encode(json.dumps(creds_info).encode()).decode()
    secret = client.V1Secret(
        data={".dockerconfigjson": docker_config},
        metadata=client.V1ObjectMeta(name=f"regcred-{run_id}", namespace=namespace),
        kind="Secret",
        type="kubernetes.io/dockerconfigjson",
    )
    try:
        try:
            return await core_api.create_namespaced_secret(namespace, secret)
        except ApiException as api_err:
            if api_err.status != 409:
                raise
            # 409 = conflict = secret already exists
            return await core_api.read_namespaced_secret(
                name=f"regcred-{run_id}", namespace=namespace
            )
    except Exception as e:
        raise LaunchError(f"Exception when creating Kubernetes secret: {str(e)}\n")
1155
+
1156
+
1157
def add_wandb_env(root: Union[dict, list], env_vars: Dict[str, str]) -> None:
    """Injects wandb environment variables into specs.

    Walks every container yielded by ``yield_containers(root)`` and appends
    one ``{"name": ..., "value": ...}`` entry per variable to the container's
    ``env`` list. Existing entries are kept; nothing is overwritten.

    If ``WANDB_RUN_ID`` is present in ``env_vars``, it is only added to the
    first container visited. The caller's ``env_vars`` dict is not modified.

    Arguments:
        root: The spec to modify.
        env_vars: The environment variables to inject.

    Returns: None.
    """
    # Work on a copy so dropping WANDB_RUN_ID does not leak to the caller.
    pending = dict(env_vars)
    for cont in yield_containers(root):
        # `env: null` in a spec yields None here; start a fresh list then.
        env = cont.get("env") or []
        env.extend({"name": key, "value": value} for key, value in pending.items())
        cont["env"] = env
        # After we have set WANDB_RUN_ID once, we don't want to set it again
        pending.pop("WANDB_RUN_ID", None)
1181
+
1182
+
1183
def yield_pods(manifest: Any) -> Iterator[dict]:
    """Yield all pod specs in a manifest.

    Recursively traverses the manifest and yields every dict that has a
    "spec" mapping containing a "containers" key. Traversal continues into
    the values of a yielded dict, so nested matches are yielded as well.
    A "spec" that is not a mapping (for example ``None``) never matches.
    """
    if isinstance(manifest, list):
        for item in manifest:
            yield from yield_pods(item)
    elif isinstance(manifest, dict):
        spec = manifest.get("spec")
        # Only a mapping can hold a "containers" key; this also avoids a
        # TypeError on `spec: null` and substring matches on string specs.
        if isinstance(spec, dict) and "containers" in spec:
            yield manifest
        for value in manifest.values():
            if isinstance(value, (dict, list)):
                yield from yield_pods(value)
1199
+
1200
+
1201
def add_label_to_pods(
    manifest: Union[dict, list], label_key: str, label_value: str
) -> None:
    """Set ``label_key: label_value`` on every pod spec found in a manifest.

    Pod specs are whatever ``yield_pods`` yields for the manifest; missing
    "metadata" and "labels" mappings are created on the way.

    Arguments:
        manifest: The manifest to modify.
        label_key: The label key to add.
        label_value: The label value to add.

    Returns: None.
    """
    for pod in yield_pods(manifest):
        pod.setdefault("metadata", {}).setdefault("labels", {})[label_key] = (
            label_value
        )
1221
+
1222
+
1223
def add_entrypoint_args_overrides(manifest: Union[dict, list], overrides: dict) -> None:
    """Add entrypoint and args overrides to all containers in a manifest.

    Recursively traverses the manifest. For every dict with a "spec" mapping
    that has a "containers" list, each container gets ``overrides["command"]``
    as its "command" and ``overrides["args"]`` as its "args", when those keys
    are present in ``overrides``.

    Arguments:
        manifest: The manifest to modify.
        overrides: Dictionary with optional "command" and "args" keys.

    Returns: None.
    """
    if isinstance(manifest, list):
        for item in manifest:
            add_entrypoint_args_overrides(item, overrides)
    elif isinstance(manifest, dict):
        spec = manifest.get("spec")
        # Guard against `spec: null` / `containers: null` instead of raising
        # TypeError on the membership test or the iteration.
        if isinstance(spec, dict) and "containers" in spec:
            for container in spec["containers"] or []:
                if "command" in overrides:
                    container["command"] = overrides["command"]
                if "args" in overrides:
                    container["args"] = overrides["args"]
        for value in manifest.values():
            add_entrypoint_args_overrides(value, overrides)
1249
+
1250
+
1251
def apply_code_mount_configuration(
    manifest: Union[Dict, list], project: LaunchProject
) -> None:
    """Mount the source-code PVC into every pod found in a manifest.

    For each pod yielded by ``yield_pods``, every container gets a
    ``wandb-source-code-volume`` mount at ``CODE_MOUNT_DIR`` (sub-path: the
    project's image source string) and has its working directory set to
    that mount path; the pod spec gets the matching PVC-backed volume.

    Arguments:
        manifest: The manifest to modify.
        project: The launch project.

    Returns: None.
    """
    assert SOURCE_CODE_PVC_NAME is not None
    source_dir = project.get_image_source_string()
    for pod in yield_pods(manifest):
        for container in yield_containers(pod):
            # A fresh mount dict per container so they are never shared.
            container.setdefault("volumeMounts", []).append(
                {
                    "name": "wandb-source-code-volume",
                    "mountPath": CODE_MOUNT_DIR,
                    "subPath": source_dir,
                }
            )
            container["workingDir"] = CODE_MOUNT_DIR
        pod["spec"].setdefault("volumes", []).append(
            {
                "name": "wandb-source-code-volume",
                "persistentVolumeClaim": {
                    "claimName": SOURCE_CODE_PVC_NAME,
                },
            }
        )
+ )