wandb 0.21.2__py3-none-macosx_12_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (904) hide show
  1. package_readme.md +97 -0
  2. wandb/__init__.py +248 -0
  3. wandb/__init__.pyi +1230 -0
  4. wandb/__main__.py +3 -0
  5. wandb/_iterutils.py +65 -0
  6. wandb/_pydantic/__init__.py +30 -0
  7. wandb/_pydantic/base.py +128 -0
  8. wandb/_pydantic/utils.py +80 -0
  9. wandb/_pydantic/v1_compat.py +284 -0
  10. wandb/agents/__init__.py +0 -0
  11. wandb/agents/pyagent.py +386 -0
  12. wandb/analytics/__init__.py +3 -0
  13. wandb/analytics/sentry.py +267 -0
  14. wandb/apis/__init__.py +48 -0
  15. wandb/apis/attrs.py +50 -0
  16. wandb/apis/importers/__init__.py +1 -0
  17. wandb/apis/importers/internals/internal.py +382 -0
  18. wandb/apis/importers/internals/protocols.py +103 -0
  19. wandb/apis/importers/internals/util.py +78 -0
  20. wandb/apis/importers/mlflow.py +254 -0
  21. wandb/apis/importers/validation.py +108 -0
  22. wandb/apis/importers/wandb.py +1608 -0
  23. wandb/apis/internal.py +239 -0
  24. wandb/apis/normalize.py +81 -0
  25. wandb/apis/paginator.py +138 -0
  26. wandb/apis/public/__init__.py +35 -0
  27. wandb/apis/public/api.py +2449 -0
  28. wandb/apis/public/artifacts.py +1046 -0
  29. wandb/apis/public/automations.py +85 -0
  30. wandb/apis/public/const.py +4 -0
  31. wandb/apis/public/files.py +402 -0
  32. wandb/apis/public/history.py +201 -0
  33. wandb/apis/public/integrations.py +203 -0
  34. wandb/apis/public/jobs.py +742 -0
  35. wandb/apis/public/projects.py +276 -0
  36. wandb/apis/public/query_generator.py +176 -0
  37. wandb/apis/public/registries/__init__.py +0 -0
  38. wandb/apis/public/registries/_freezable_list.py +179 -0
  39. wandb/apis/public/registries/_utils.py +138 -0
  40. wandb/apis/public/registries/registries_search.py +347 -0
  41. wandb/apis/public/registries/registry.py +358 -0
  42. wandb/apis/public/reports.py +595 -0
  43. wandb/apis/public/runs.py +1216 -0
  44. wandb/apis/public/sweeps.py +440 -0
  45. wandb/apis/public/teams.py +235 -0
  46. wandb/apis/public/users.py +177 -0
  47. wandb/apis/public/utils.py +210 -0
  48. wandb/apis/reports/__init__.py +1 -0
  49. wandb/apis/reports/v1/__init__.py +8 -0
  50. wandb/apis/reports/v2/__init__.py +8 -0
  51. wandb/apis/workspaces/__init__.py +8 -0
  52. wandb/automations/__init__.py +73 -0
  53. wandb/automations/_filters/__init__.py +40 -0
  54. wandb/automations/_filters/expressions.py +181 -0
  55. wandb/automations/_filters/operators.py +258 -0
  56. wandb/automations/_filters/run_metrics.py +330 -0
  57. wandb/automations/_generated/__init__.py +177 -0
  58. wandb/automations/_generated/create_automation.py +17 -0
  59. wandb/automations/_generated/create_generic_webhook_integration.py +43 -0
  60. wandb/automations/_generated/delete_automation.py +15 -0
  61. wandb/automations/_generated/enums.py +35 -0
  62. wandb/automations/_generated/fragments.py +358 -0
  63. wandb/automations/_generated/generic_webhook_integrations_by_entity.py +22 -0
  64. wandb/automations/_generated/get_automations.py +24 -0
  65. wandb/automations/_generated/get_automations_by_entity.py +26 -0
  66. wandb/automations/_generated/input_types.py +104 -0
  67. wandb/automations/_generated/integrations_by_entity.py +22 -0
  68. wandb/automations/_generated/operations.py +647 -0
  69. wandb/automations/_generated/slack_integrations_by_entity.py +22 -0
  70. wandb/automations/_generated/update_automation.py +17 -0
  71. wandb/automations/_utils.py +235 -0
  72. wandb/automations/_validators.py +165 -0
  73. wandb/automations/actions.py +218 -0
  74. wandb/automations/automations.py +85 -0
  75. wandb/automations/events.py +285 -0
  76. wandb/automations/integrations.py +45 -0
  77. wandb/automations/scopes.py +78 -0
  78. wandb/beta/workflows.py +324 -0
  79. wandb/bin/gpu_stats +0 -0
  80. wandb/bin/wandb-core +0 -0
  81. wandb/cli/__init__.py +0 -0
  82. wandb/cli/beta.py +175 -0
  83. wandb/cli/cli.py +2883 -0
  84. wandb/data_types.py +66 -0
  85. wandb/docker/__init__.py +290 -0
  86. wandb/docker/names.py +40 -0
  87. wandb/docker/wandb-entrypoint.sh +33 -0
  88. wandb/env.py +535 -0
  89. wandb/errors/__init__.py +17 -0
  90. wandb/errors/errors.py +40 -0
  91. wandb/errors/links.py +73 -0
  92. wandb/errors/term.py +415 -0
  93. wandb/errors/util.py +57 -0
  94. wandb/errors/warnings.py +2 -0
  95. wandb/filesync/__init__.py +0 -0
  96. wandb/filesync/dir_watcher.py +404 -0
  97. wandb/filesync/stats.py +100 -0
  98. wandb/filesync/step_checksum.py +142 -0
  99. wandb/filesync/step_prepare.py +179 -0
  100. wandb/filesync/step_upload.py +287 -0
  101. wandb/filesync/upload_job.py +142 -0
  102. wandb/integration/__init__.py +0 -0
  103. wandb/integration/catboost/__init__.py +5 -0
  104. wandb/integration/catboost/catboost.py +182 -0
  105. wandb/integration/cohere/__init__.py +3 -0
  106. wandb/integration/cohere/cohere.py +21 -0
  107. wandb/integration/cohere/resolver.py +347 -0
  108. wandb/integration/diffusers/__init__.py +3 -0
  109. wandb/integration/diffusers/autologger.py +76 -0
  110. wandb/integration/diffusers/pipeline_resolver.py +50 -0
  111. wandb/integration/diffusers/resolvers/__init__.py +9 -0
  112. wandb/integration/diffusers/resolvers/multimodal.py +881 -0
  113. wandb/integration/diffusers/resolvers/utils.py +102 -0
  114. wandb/integration/fastai/__init__.py +243 -0
  115. wandb/integration/gym/__init__.py +98 -0
  116. wandb/integration/huggingface/__init__.py +3 -0
  117. wandb/integration/huggingface/huggingface.py +18 -0
  118. wandb/integration/huggingface/resolver.py +213 -0
  119. wandb/integration/keras/__init__.py +11 -0
  120. wandb/integration/keras/callbacks/__init__.py +5 -0
  121. wandb/integration/keras/callbacks/metrics_logger.py +129 -0
  122. wandb/integration/keras/callbacks/model_checkpoint.py +188 -0
  123. wandb/integration/keras/callbacks/tables_builder.py +228 -0
  124. wandb/integration/keras/keras.py +1086 -0
  125. wandb/integration/kfp/__init__.py +6 -0
  126. wandb/integration/kfp/helpers.py +28 -0
  127. wandb/integration/kfp/kfp_patch.py +335 -0
  128. wandb/integration/kfp/wandb_logging.py +182 -0
  129. wandb/integration/langchain/__init__.py +3 -0
  130. wandb/integration/langchain/wandb_tracer.py +49 -0
  131. wandb/integration/lightgbm/__init__.py +239 -0
  132. wandb/integration/lightning/__init__.py +0 -0
  133. wandb/integration/lightning/fabric/__init__.py +3 -0
  134. wandb/integration/lightning/fabric/logger.py +763 -0
  135. wandb/integration/metaflow/__init__.py +9 -0
  136. wandb/integration/metaflow/data_pandas.py +74 -0
  137. wandb/integration/metaflow/data_pytorch.py +75 -0
  138. wandb/integration/metaflow/data_sklearn.py +76 -0
  139. wandb/integration/metaflow/errors.py +13 -0
  140. wandb/integration/metaflow/metaflow.py +327 -0
  141. wandb/integration/openai/__init__.py +3 -0
  142. wandb/integration/openai/fine_tuning.py +480 -0
  143. wandb/integration/openai/openai.py +22 -0
  144. wandb/integration/openai/resolver.py +240 -0
  145. wandb/integration/prodigy/__init__.py +3 -0
  146. wandb/integration/prodigy/prodigy.py +291 -0
  147. wandb/integration/sacred/__init__.py +117 -0
  148. wandb/integration/sagemaker/__init__.py +14 -0
  149. wandb/integration/sagemaker/auth.py +29 -0
  150. wandb/integration/sagemaker/config.py +58 -0
  151. wandb/integration/sagemaker/files.py +2 -0
  152. wandb/integration/sagemaker/resources.py +63 -0
  153. wandb/integration/sb3/__init__.py +3 -0
  154. wandb/integration/sb3/sb3.py +147 -0
  155. wandb/integration/sklearn/__init__.py +37 -0
  156. wandb/integration/sklearn/calculate/__init__.py +32 -0
  157. wandb/integration/sklearn/calculate/calibration_curves.py +125 -0
  158. wandb/integration/sklearn/calculate/class_proportions.py +68 -0
  159. wandb/integration/sklearn/calculate/confusion_matrix.py +93 -0
  160. wandb/integration/sklearn/calculate/decision_boundaries.py +40 -0
  161. wandb/integration/sklearn/calculate/elbow_curve.py +55 -0
  162. wandb/integration/sklearn/calculate/feature_importances.py +67 -0
  163. wandb/integration/sklearn/calculate/learning_curve.py +64 -0
  164. wandb/integration/sklearn/calculate/outlier_candidates.py +69 -0
  165. wandb/integration/sklearn/calculate/residuals.py +86 -0
  166. wandb/integration/sklearn/calculate/silhouette.py +118 -0
  167. wandb/integration/sklearn/calculate/summary_metrics.py +62 -0
  168. wandb/integration/sklearn/plot/__init__.py +35 -0
  169. wandb/integration/sklearn/plot/classifier.py +329 -0
  170. wandb/integration/sklearn/plot/clusterer.py +146 -0
  171. wandb/integration/sklearn/plot/regressor.py +121 -0
  172. wandb/integration/sklearn/plot/shared.py +91 -0
  173. wandb/integration/sklearn/utils.py +184 -0
  174. wandb/integration/tensorboard/__init__.py +10 -0
  175. wandb/integration/tensorboard/log.py +351 -0
  176. wandb/integration/tensorboard/monkeypatch.py +186 -0
  177. wandb/integration/tensorflow/__init__.py +5 -0
  178. wandb/integration/tensorflow/estimator_hook.py +54 -0
  179. wandb/integration/torch/__init__.py +0 -0
  180. wandb/integration/torch/wandb_torch.py +554 -0
  181. wandb/integration/ultralytics/__init__.py +11 -0
  182. wandb/integration/ultralytics/bbox_utils.py +215 -0
  183. wandb/integration/ultralytics/callback.py +528 -0
  184. wandb/integration/ultralytics/classification_utils.py +83 -0
  185. wandb/integration/ultralytics/mask_utils.py +202 -0
  186. wandb/integration/ultralytics/pose_utils.py +103 -0
  187. wandb/integration/weave/__init__.py +6 -0
  188. wandb/integration/weave/interface.py +49 -0
  189. wandb/integration/weave/weave.py +63 -0
  190. wandb/integration/xgboost/__init__.py +11 -0
  191. wandb/integration/xgboost/xgboost.py +189 -0
  192. wandb/integration/yolov8/__init__.py +0 -0
  193. wandb/integration/yolov8/yolov8.py +284 -0
  194. wandb/jupyter.py +538 -0
  195. wandb/mpmain/__init__.py +0 -0
  196. wandb/mpmain/__main__.py +1 -0
  197. wandb/old/__init__.py +0 -0
  198. wandb/old/core.py +53 -0
  199. wandb/old/settings.py +176 -0
  200. wandb/old/summary.py +438 -0
  201. wandb/plot/__init__.py +30 -0
  202. wandb/plot/bar.py +71 -0
  203. wandb/plot/confusion_matrix.py +185 -0
  204. wandb/plot/custom_chart.py +147 -0
  205. wandb/plot/histogram.py +66 -0
  206. wandb/plot/line.py +75 -0
  207. wandb/plot/line_series.py +173 -0
  208. wandb/plot/pr_curve.py +186 -0
  209. wandb/plot/roc_curve.py +163 -0
  210. wandb/plot/scatter.py +66 -0
  211. wandb/plot/utils.py +184 -0
  212. wandb/plot/viz.py +41 -0
  213. wandb/proto/__init__.py +0 -0
  214. wandb/proto/v3/__init__.py +0 -0
  215. wandb/proto/v3/wandb_base_pb2.py +55 -0
  216. wandb/proto/v3/wandb_internal_pb2.py +1728 -0
  217. wandb/proto/v3/wandb_server_pb2.py +228 -0
  218. wandb/proto/v3/wandb_settings_pb2.py +122 -0
  219. wandb/proto/v3/wandb_telemetry_pb2.py +106 -0
  220. wandb/proto/v4/__init__.py +0 -0
  221. wandb/proto/v4/wandb_base_pb2.py +30 -0
  222. wandb/proto/v4/wandb_internal_pb2.py +382 -0
  223. wandb/proto/v4/wandb_server_pb2.py +67 -0
  224. wandb/proto/v4/wandb_settings_pb2.py +47 -0
  225. wandb/proto/v4/wandb_telemetry_pb2.py +41 -0
  226. wandb/proto/v5/wandb_base_pb2.py +31 -0
  227. wandb/proto/v5/wandb_internal_pb2.py +383 -0
  228. wandb/proto/v5/wandb_server_pb2.py +68 -0
  229. wandb/proto/v5/wandb_settings_pb2.py +48 -0
  230. wandb/proto/v5/wandb_telemetry_pb2.py +42 -0
  231. wandb/proto/v6/wandb_base_pb2.py +41 -0
  232. wandb/proto/v6/wandb_internal_pb2.py +393 -0
  233. wandb/proto/v6/wandb_server_pb2.py +78 -0
  234. wandb/proto/v6/wandb_settings_pb2.py +58 -0
  235. wandb/proto/v6/wandb_telemetry_pb2.py +52 -0
  236. wandb/proto/wandb_base_pb2.py +12 -0
  237. wandb/proto/wandb_deprecated.py +59 -0
  238. wandb/proto/wandb_generate_deprecated.py +30 -0
  239. wandb/proto/wandb_generate_proto.py +49 -0
  240. wandb/proto/wandb_internal_pb2.py +18 -0
  241. wandb/proto/wandb_server_pb2.py +12 -0
  242. wandb/proto/wandb_settings_pb2.py +12 -0
  243. wandb/proto/wandb_telemetry_pb2.py +12 -0
  244. wandb/py.typed +0 -0
  245. wandb/sdk/__init__.py +37 -0
  246. wandb/sdk/artifacts/__init__.py +0 -0
  247. wandb/sdk/artifacts/_factories.py +17 -0
  248. wandb/sdk/artifacts/_generated/__init__.py +508 -0
  249. wandb/sdk/artifacts/_generated/add_aliases.py +21 -0
  250. wandb/sdk/artifacts/_generated/artifact_by_id.py +17 -0
  251. wandb/sdk/artifacts/_generated/artifact_by_name.py +22 -0
  252. wandb/sdk/artifacts/_generated/artifact_collection_membership_file_urls.py +43 -0
  253. wandb/sdk/artifacts/_generated/artifact_collection_membership_files.py +43 -0
  254. wandb/sdk/artifacts/_generated/artifact_created_by.py +47 -0
  255. wandb/sdk/artifacts/_generated/artifact_file_urls.py +22 -0
  256. wandb/sdk/artifacts/_generated/artifact_type.py +31 -0
  257. wandb/sdk/artifacts/_generated/artifact_used_by.py +43 -0
  258. wandb/sdk/artifacts/_generated/artifact_version_files.py +36 -0
  259. wandb/sdk/artifacts/_generated/artifact_via_membership_by_name.py +26 -0
  260. wandb/sdk/artifacts/_generated/create_artifact_collection_tag_assignments.py +36 -0
  261. wandb/sdk/artifacts/_generated/delete_aliases.py +21 -0
  262. wandb/sdk/artifacts/_generated/delete_artifact.py +28 -0
  263. wandb/sdk/artifacts/_generated/delete_artifact_collection_tag_assignments.py +25 -0
  264. wandb/sdk/artifacts/_generated/delete_artifact_portfolio.py +35 -0
  265. wandb/sdk/artifacts/_generated/delete_artifact_sequence.py +35 -0
  266. wandb/sdk/artifacts/_generated/enums.py +22 -0
  267. wandb/sdk/artifacts/_generated/fetch_artifact_manifest.py +38 -0
  268. wandb/sdk/artifacts/_generated/fetch_linked_artifacts.py +67 -0
  269. wandb/sdk/artifacts/_generated/fetch_registries.py +32 -0
  270. wandb/sdk/artifacts/_generated/fragments.py +459 -0
  271. wandb/sdk/artifacts/_generated/input_types.py +46 -0
  272. wandb/sdk/artifacts/_generated/link_artifact.py +27 -0
  273. wandb/sdk/artifacts/_generated/move_artifact_collection.py +35 -0
  274. wandb/sdk/artifacts/_generated/operations.py +1223 -0
  275. wandb/sdk/artifacts/_generated/project_artifact_collection.py +101 -0
  276. wandb/sdk/artifacts/_generated/project_artifact_collections.py +33 -0
  277. wandb/sdk/artifacts/_generated/project_artifact_type.py +24 -0
  278. wandb/sdk/artifacts/_generated/project_artifact_types.py +24 -0
  279. wandb/sdk/artifacts/_generated/project_artifacts.py +42 -0
  280. wandb/sdk/artifacts/_generated/registry_collections.py +34 -0
  281. wandb/sdk/artifacts/_generated/registry_versions.py +34 -0
  282. wandb/sdk/artifacts/_generated/run_input_artifacts.py +51 -0
  283. wandb/sdk/artifacts/_generated/run_output_artifacts.py +51 -0
  284. wandb/sdk/artifacts/_generated/unlink_artifact.py +25 -0
  285. wandb/sdk/artifacts/_generated/update_artifact.py +26 -0
  286. wandb/sdk/artifacts/_generated/update_artifact_portfolio.py +35 -0
  287. wandb/sdk/artifacts/_generated/update_artifact_sequence.py +35 -0
  288. wandb/sdk/artifacts/_graphql_fragments.py +19 -0
  289. wandb/sdk/artifacts/_internal_artifact.py +54 -0
  290. wandb/sdk/artifacts/_validators.py +309 -0
  291. wandb/sdk/artifacts/artifact.py +2702 -0
  292. wandb/sdk/artifacts/artifact_download_logger.py +45 -0
  293. wandb/sdk/artifacts/artifact_file_cache.py +251 -0
  294. wandb/sdk/artifacts/artifact_instance_cache.py +17 -0
  295. wandb/sdk/artifacts/artifact_manifest.py +76 -0
  296. wandb/sdk/artifacts/artifact_manifest_entry.py +258 -0
  297. wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
  298. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +94 -0
  299. wandb/sdk/artifacts/artifact_saver.py +277 -0
  300. wandb/sdk/artifacts/artifact_state.py +13 -0
  301. wandb/sdk/artifacts/artifact_ttl.py +9 -0
  302. wandb/sdk/artifacts/exceptions.py +71 -0
  303. wandb/sdk/artifacts/staging.py +27 -0
  304. wandb/sdk/artifacts/storage_handler.py +62 -0
  305. wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
  306. wandb/sdk/artifacts/storage_handlers/azure_handler.py +214 -0
  307. wandb/sdk/artifacts/storage_handlers/gcs_handler.py +224 -0
  308. wandb/sdk/artifacts/storage_handlers/http_handler.py +114 -0
  309. wandb/sdk/artifacts/storage_handlers/local_file_handler.py +142 -0
  310. wandb/sdk/artifacts/storage_handlers/multi_handler.py +56 -0
  311. wandb/sdk/artifacts/storage_handlers/s3_handler.py +339 -0
  312. wandb/sdk/artifacts/storage_handlers/tracking_handler.py +68 -0
  313. wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +131 -0
  314. wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +74 -0
  315. wandb/sdk/artifacts/storage_layout.py +8 -0
  316. wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
  317. wandb/sdk/artifacts/storage_policies/register.py +1 -0
  318. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +580 -0
  319. wandb/sdk/artifacts/storage_policy.py +75 -0
  320. wandb/sdk/backend/__init__.py +0 -0
  321. wandb/sdk/backend/backend.py +57 -0
  322. wandb/sdk/data_types/__init__.py +0 -0
  323. wandb/sdk/data_types/_dtypes.py +914 -0
  324. wandb/sdk/data_types/_private.py +10 -0
  325. wandb/sdk/data_types/audio.py +208 -0
  326. wandb/sdk/data_types/base_types/__init__.py +0 -0
  327. wandb/sdk/data_types/base_types/json_metadata.py +55 -0
  328. wandb/sdk/data_types/base_types/media.py +339 -0
  329. wandb/sdk/data_types/base_types/wb_value.py +295 -0
  330. wandb/sdk/data_types/bokeh.py +87 -0
  331. wandb/sdk/data_types/graph.py +439 -0
  332. wandb/sdk/data_types/helper_types/__init__.py +0 -0
  333. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +327 -0
  334. wandb/sdk/data_types/helper_types/classes.py +159 -0
  335. wandb/sdk/data_types/helper_types/image_mask.py +251 -0
  336. wandb/sdk/data_types/histogram.py +107 -0
  337. wandb/sdk/data_types/html.py +165 -0
  338. wandb/sdk/data_types/image.py +974 -0
  339. wandb/sdk/data_types/molecule.py +250 -0
  340. wandb/sdk/data_types/object_3d.py +495 -0
  341. wandb/sdk/data_types/plotly.py +95 -0
  342. wandb/sdk/data_types/saved_model.py +435 -0
  343. wandb/sdk/data_types/table.py +1468 -0
  344. wandb/sdk/data_types/table_decorators.py +108 -0
  345. wandb/sdk/data_types/trace_tree.py +440 -0
  346. wandb/sdk/data_types/utils.py +260 -0
  347. wandb/sdk/data_types/video.py +303 -0
  348. wandb/sdk/integration_utils/__init__.py +0 -0
  349. wandb/sdk/integration_utils/auto_logging.py +232 -0
  350. wandb/sdk/integration_utils/data_logging.py +475 -0
  351. wandb/sdk/interface/__init__.py +0 -0
  352. wandb/sdk/interface/constants.py +4 -0
  353. wandb/sdk/interface/interface.py +1056 -0
  354. wandb/sdk/interface/interface_queue.py +40 -0
  355. wandb/sdk/interface/interface_shared.py +471 -0
  356. wandb/sdk/interface/interface_sock.py +49 -0
  357. wandb/sdk/interface/summary_record.py +67 -0
  358. wandb/sdk/internal/__init__.py +0 -0
  359. wandb/sdk/internal/_generated/__init__.py +15 -0
  360. wandb/sdk/internal/_generated/enums.py +4 -0
  361. wandb/sdk/internal/_generated/input_types.py +4 -0
  362. wandb/sdk/internal/_generated/operations.py +15 -0
  363. wandb/sdk/internal/_generated/server_features_query.py +27 -0
  364. wandb/sdk/internal/context.py +89 -0
  365. wandb/sdk/internal/datastore.py +293 -0
  366. wandb/sdk/internal/file_pusher.py +177 -0
  367. wandb/sdk/internal/file_stream.py +686 -0
  368. wandb/sdk/internal/handler.py +854 -0
  369. wandb/sdk/internal/incremental_table_util.py +53 -0
  370. wandb/sdk/internal/internal_api.py +4723 -0
  371. wandb/sdk/internal/job_builder.py +639 -0
  372. wandb/sdk/internal/profiler.py +79 -0
  373. wandb/sdk/internal/progress.py +77 -0
  374. wandb/sdk/internal/run.py +27 -0
  375. wandb/sdk/internal/sample.py +70 -0
  376. wandb/sdk/internal/sender.py +1692 -0
  377. wandb/sdk/internal/sender_config.py +203 -0
  378. wandb/sdk/internal/settings_static.py +120 -0
  379. wandb/sdk/internal/tb_watcher.py +519 -0
  380. wandb/sdk/internal/thread_local_settings.py +18 -0
  381. wandb/sdk/launch/__init__.py +15 -0
  382. wandb/sdk/launch/_launch.py +331 -0
  383. wandb/sdk/launch/_launch_add.py +255 -0
  384. wandb/sdk/launch/_project_spec.py +565 -0
  385. wandb/sdk/launch/agent/__init__.py +5 -0
  386. wandb/sdk/launch/agent/agent.py +931 -0
  387. wandb/sdk/launch/agent/config.py +296 -0
  388. wandb/sdk/launch/agent/job_status_tracker.py +55 -0
  389. wandb/sdk/launch/agent/run_queue_item_file_saver.py +39 -0
  390. wandb/sdk/launch/builder/__init__.py +0 -0
  391. wandb/sdk/launch/builder/abstract.py +156 -0
  392. wandb/sdk/launch/builder/build.py +296 -0
  393. wandb/sdk/launch/builder/context_manager.py +235 -0
  394. wandb/sdk/launch/builder/docker_builder.py +177 -0
  395. wandb/sdk/launch/builder/kaniko_builder.py +595 -0
  396. wandb/sdk/launch/builder/noop.py +58 -0
  397. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +188 -0
  398. wandb/sdk/launch/builder/templates/dockerfile.py +92 -0
  399. wandb/sdk/launch/create_job.py +541 -0
  400. wandb/sdk/launch/environment/abstract.py +29 -0
  401. wandb/sdk/launch/environment/aws_environment.py +322 -0
  402. wandb/sdk/launch/environment/azure_environment.py +105 -0
  403. wandb/sdk/launch/environment/gcp_environment.py +334 -0
  404. wandb/sdk/launch/environment/local_environment.py +65 -0
  405. wandb/sdk/launch/errors.py +13 -0
  406. wandb/sdk/launch/git_reference.py +109 -0
  407. wandb/sdk/launch/inputs/files.py +148 -0
  408. wandb/sdk/launch/inputs/internal.py +314 -0
  409. wandb/sdk/launch/inputs/manage.py +113 -0
  410. wandb/sdk/launch/inputs/schema.py +40 -0
  411. wandb/sdk/launch/loader.py +249 -0
  412. wandb/sdk/launch/registry/abstract.py +48 -0
  413. wandb/sdk/launch/registry/anon.py +29 -0
  414. wandb/sdk/launch/registry/azure_container_registry.py +124 -0
  415. wandb/sdk/launch/registry/elastic_container_registry.py +192 -0
  416. wandb/sdk/launch/registry/google_artifact_registry.py +219 -0
  417. wandb/sdk/launch/registry/local_registry.py +65 -0
  418. wandb/sdk/launch/runner/__init__.py +0 -0
  419. wandb/sdk/launch/runner/abstract.py +185 -0
  420. wandb/sdk/launch/runner/kubernetes_monitor.py +473 -0
  421. wandb/sdk/launch/runner/kubernetes_runner.py +1285 -0
  422. wandb/sdk/launch/runner/local_container.py +301 -0
  423. wandb/sdk/launch/runner/local_process.py +78 -0
  424. wandb/sdk/launch/runner/sagemaker_runner.py +424 -0
  425. wandb/sdk/launch/runner/vertex_runner.py +225 -0
  426. wandb/sdk/launch/sweeps/__init__.py +37 -0
  427. wandb/sdk/launch/sweeps/scheduler.py +739 -0
  428. wandb/sdk/launch/sweeps/scheduler_sweep.py +90 -0
  429. wandb/sdk/launch/sweeps/utils.py +324 -0
  430. wandb/sdk/launch/utils.py +746 -0
  431. wandb/sdk/launch/wandb_reference.py +138 -0
  432. wandb/sdk/lib/__init__.py +5 -0
  433. wandb/sdk/lib/apikey.py +334 -0
  434. wandb/sdk/lib/asyncio_compat.py +213 -0
  435. wandb/sdk/lib/asyncio_manager.py +252 -0
  436. wandb/sdk/lib/capped_dict.py +26 -0
  437. wandb/sdk/lib/config_util.py +101 -0
  438. wandb/sdk/lib/console_capture.py +219 -0
  439. wandb/sdk/lib/credentials.py +141 -0
  440. wandb/sdk/lib/deprecate.py +27 -0
  441. wandb/sdk/lib/disabled.py +30 -0
  442. wandb/sdk/lib/exit_hooks.py +54 -0
  443. wandb/sdk/lib/file_stream_utils.py +118 -0
  444. wandb/sdk/lib/filenames.py +64 -0
  445. wandb/sdk/lib/filesystem.py +372 -0
  446. wandb/sdk/lib/fsm.py +165 -0
  447. wandb/sdk/lib/gitlib.py +240 -0
  448. wandb/sdk/lib/gql_request.py +65 -0
  449. wandb/sdk/lib/handler_util.py +21 -0
  450. wandb/sdk/lib/hashutil.py +106 -0
  451. wandb/sdk/lib/import_hooks.py +275 -0
  452. wandb/sdk/lib/interrupt.py +37 -0
  453. wandb/sdk/lib/ipython.py +126 -0
  454. wandb/sdk/lib/json_util.py +75 -0
  455. wandb/sdk/lib/lazyloader.py +63 -0
  456. wandb/sdk/lib/module.py +72 -0
  457. wandb/sdk/lib/paths.py +106 -0
  458. wandb/sdk/lib/preinit.py +42 -0
  459. wandb/sdk/lib/printer.py +571 -0
  460. wandb/sdk/lib/printer_asyncio.py +48 -0
  461. wandb/sdk/lib/progress.py +320 -0
  462. wandb/sdk/lib/proto_util.py +90 -0
  463. wandb/sdk/lib/redirect.py +876 -0
  464. wandb/sdk/lib/retry.py +395 -0
  465. wandb/sdk/lib/run_moment.py +82 -0
  466. wandb/sdk/lib/runid.py +12 -0
  467. wandb/sdk/lib/server.py +58 -0
  468. wandb/sdk/lib/service/ipc_support.py +13 -0
  469. wandb/sdk/lib/service/service_client.py +106 -0
  470. wandb/sdk/lib/service/service_connection.py +192 -0
  471. wandb/sdk/lib/service/service_port_file.py +105 -0
  472. wandb/sdk/lib/service/service_process.py +111 -0
  473. wandb/sdk/lib/service/service_token.py +181 -0
  474. wandb/sdk/lib/sparkline.py +44 -0
  475. wandb/sdk/lib/telemetry.py +100 -0
  476. wandb/sdk/lib/timed_input.py +133 -0
  477. wandb/sdk/lib/timer.py +19 -0
  478. wandb/sdk/lib/wb_logging.py +161 -0
  479. wandb/sdk/mailbox/__init__.py +23 -0
  480. wandb/sdk/mailbox/mailbox.py +143 -0
  481. wandb/sdk/mailbox/mailbox_handle.py +132 -0
  482. wandb/sdk/mailbox/response_handle.py +99 -0
  483. wandb/sdk/mailbox/wait_with_progress.py +100 -0
  484. wandb/sdk/projects/_generated/__init__.py +47 -0
  485. wandb/sdk/projects/_generated/delete_project.py +22 -0
  486. wandb/sdk/projects/_generated/enums.py +4 -0
  487. wandb/sdk/projects/_generated/fetch_registry.py +22 -0
  488. wandb/sdk/projects/_generated/fragments.py +41 -0
  489. wandb/sdk/projects/_generated/input_types.py +13 -0
  490. wandb/sdk/projects/_generated/operations.py +88 -0
  491. wandb/sdk/projects/_generated/rename_project.py +27 -0
  492. wandb/sdk/projects/_generated/upsert_registry_project.py +27 -0
  493. wandb/sdk/verify/__init__.py +0 -0
  494. wandb/sdk/verify/verify.py +555 -0
  495. wandb/sdk/wandb_alerts.py +12 -0
  496. wandb/sdk/wandb_config.py +323 -0
  497. wandb/sdk/wandb_helper.py +54 -0
  498. wandb/sdk/wandb_init.py +1581 -0
  499. wandb/sdk/wandb_login.py +332 -0
  500. wandb/sdk/wandb_metric.py +112 -0
  501. wandb/sdk/wandb_require.py +88 -0
  502. wandb/sdk/wandb_require_helpers.py +44 -0
  503. wandb/sdk/wandb_run.py +4088 -0
  504. wandb/sdk/wandb_settings.py +2105 -0
  505. wandb/sdk/wandb_setup.py +560 -0
  506. wandb/sdk/wandb_summary.py +150 -0
  507. wandb/sdk/wandb_sweep.py +120 -0
  508. wandb/sdk/wandb_sync.py +71 -0
  509. wandb/sdk/wandb_watch.py +146 -0
  510. wandb/sklearn.py +35 -0
  511. wandb/sync/__init__.py +3 -0
  512. wandb/sync/sync.py +452 -0
  513. wandb/trigger.py +29 -0
  514. wandb/util.py +2040 -0
  515. wandb/vendor/__init__.py +0 -0
  516. wandb/vendor/gql-0.2.0/setup.py +40 -0
  517. wandb/vendor/gql-0.2.0/tests/__init__.py +0 -0
  518. wandb/vendor/gql-0.2.0/tests/starwars/__init__.py +0 -0
  519. wandb/vendor/gql-0.2.0/tests/starwars/fixtures.py +96 -0
  520. wandb/vendor/gql-0.2.0/tests/starwars/schema.py +146 -0
  521. wandb/vendor/gql-0.2.0/tests/starwars/test_dsl.py +293 -0
  522. wandb/vendor/gql-0.2.0/tests/starwars/test_query.py +355 -0
  523. wandb/vendor/gql-0.2.0/tests/starwars/test_validation.py +171 -0
  524. wandb/vendor/gql-0.2.0/tests/test_client.py +31 -0
  525. wandb/vendor/gql-0.2.0/tests/test_transport.py +89 -0
  526. wandb/vendor/gql-0.2.0/wandb_gql/__init__.py +4 -0
  527. wandb/vendor/gql-0.2.0/wandb_gql/client.py +75 -0
  528. wandb/vendor/gql-0.2.0/wandb_gql/dsl.py +152 -0
  529. wandb/vendor/gql-0.2.0/wandb_gql/gql.py +10 -0
  530. wandb/vendor/gql-0.2.0/wandb_gql/transport/__init__.py +0 -0
  531. wandb/vendor/gql-0.2.0/wandb_gql/transport/http.py +6 -0
  532. wandb/vendor/gql-0.2.0/wandb_gql/transport/local_schema.py +15 -0
  533. wandb/vendor/gql-0.2.0/wandb_gql/transport/requests.py +46 -0
  534. wandb/vendor/gql-0.2.0/wandb_gql/utils.py +21 -0
  535. wandb/vendor/graphql-core-1.1/setup.py +86 -0
  536. wandb/vendor/graphql-core-1.1/wandb_graphql/__init__.py +287 -0
  537. wandb/vendor/graphql-core-1.1/wandb_graphql/error/__init__.py +6 -0
  538. wandb/vendor/graphql-core-1.1/wandb_graphql/error/base.py +42 -0
  539. wandb/vendor/graphql-core-1.1/wandb_graphql/error/format_error.py +11 -0
  540. wandb/vendor/graphql-core-1.1/wandb_graphql/error/located_error.py +29 -0
  541. wandb/vendor/graphql-core-1.1/wandb_graphql/error/syntax_error.py +36 -0
  542. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/__init__.py +26 -0
  543. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/base.py +311 -0
  544. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executor.py +398 -0
  545. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/__init__.py +0 -0
  546. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/asyncio.py +53 -0
  547. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/gevent.py +22 -0
  548. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/process.py +32 -0
  549. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/sync.py +7 -0
  550. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/thread.py +35 -0
  551. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/utils.py +6 -0
  552. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/__init__.py +0 -0
  553. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/executor.py +66 -0
  554. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/fragment.py +252 -0
  555. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/resolver.py +151 -0
  556. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/utils.py +7 -0
  557. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/middleware.py +57 -0
  558. wandb/vendor/graphql-core-1.1/wandb_graphql/execution/values.py +145 -0
  559. wandb/vendor/graphql-core-1.1/wandb_graphql/graphql.py +60 -0
  560. wandb/vendor/graphql-core-1.1/wandb_graphql/language/__init__.py +0 -0
  561. wandb/vendor/graphql-core-1.1/wandb_graphql/language/ast.py +1349 -0
  562. wandb/vendor/graphql-core-1.1/wandb_graphql/language/base.py +19 -0
  563. wandb/vendor/graphql-core-1.1/wandb_graphql/language/lexer.py +435 -0
  564. wandb/vendor/graphql-core-1.1/wandb_graphql/language/location.py +30 -0
  565. wandb/vendor/graphql-core-1.1/wandb_graphql/language/parser.py +779 -0
  566. wandb/vendor/graphql-core-1.1/wandb_graphql/language/printer.py +193 -0
  567. wandb/vendor/graphql-core-1.1/wandb_graphql/language/source.py +18 -0
  568. wandb/vendor/graphql-core-1.1/wandb_graphql/language/visitor.py +222 -0
  569. wandb/vendor/graphql-core-1.1/wandb_graphql/language/visitor_meta.py +82 -0
  570. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/__init__.py +0 -0
  571. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/cached_property.py +17 -0
  572. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/contain_subset.py +28 -0
  573. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/default_ordered_dict.py +40 -0
  574. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/ordereddict.py +8 -0
  575. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/pair_set.py +43 -0
  576. wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/version.py +78 -0
  577. wandb/vendor/graphql-core-1.1/wandb_graphql/type/__init__.py +67 -0
  578. wandb/vendor/graphql-core-1.1/wandb_graphql/type/definition.py +619 -0
  579. wandb/vendor/graphql-core-1.1/wandb_graphql/type/directives.py +132 -0
  580. wandb/vendor/graphql-core-1.1/wandb_graphql/type/introspection.py +440 -0
  581. wandb/vendor/graphql-core-1.1/wandb_graphql/type/scalars.py +131 -0
  582. wandb/vendor/graphql-core-1.1/wandb_graphql/type/schema.py +100 -0
  583. wandb/vendor/graphql-core-1.1/wandb_graphql/type/typemap.py +145 -0
  584. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/__init__.py +0 -0
  585. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/assert_valid_name.py +9 -0
  586. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_from_value.py +65 -0
  587. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_to_code.py +49 -0
  588. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_to_dict.py +24 -0
  589. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/base.py +75 -0
  590. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/build_ast_schema.py +291 -0
  591. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/build_client_schema.py +250 -0
  592. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/concat_ast.py +9 -0
  593. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/extend_schema.py +357 -0
  594. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/get_field_def.py +27 -0
  595. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/get_operation_ast.py +21 -0
  596. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/introspection_query.py +90 -0
  597. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/is_valid_literal_value.py +67 -0
  598. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/is_valid_value.py +66 -0
  599. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/quoted_or_list.py +21 -0
  600. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/schema_printer.py +168 -0
  601. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/suggestion_list.py +56 -0
  602. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_comparators.py +69 -0
  603. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_from_ast.py +21 -0
  604. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_info.py +149 -0
  605. wandb/vendor/graphql-core-1.1/wandb_graphql/utils/value_from_ast.py +69 -0
  606. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/__init__.py +4 -0
  607. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/__init__.py +79 -0
  608. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/arguments_of_correct_type.py +24 -0
  609. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/base.py +8 -0
  610. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/default_values_of_correct_type.py +44 -0
  611. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/fields_on_correct_type.py +113 -0
  612. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/fragments_on_composite_types.py +33 -0
  613. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_argument_names.py +70 -0
  614. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_directives.py +97 -0
  615. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_fragment_names.py +19 -0
  616. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_type_names.py +43 -0
  617. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/lone_anonymous_operation.py +23 -0
  618. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_fragment_cycles.py +59 -0
  619. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_undefined_variables.py +36 -0
  620. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_unused_fragments.py +38 -0
  621. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_unused_variables.py +37 -0
  622. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/overlapping_fields_can_be_merged.py +529 -0
  623. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/possible_fragment_spreads.py +44 -0
  624. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/provided_non_null_arguments.py +46 -0
  625. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/scalar_leafs.py +33 -0
  626. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_argument_names.py +32 -0
  627. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_fragment_names.py +28 -0
  628. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_input_field_names.py +33 -0
  629. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_operation_names.py +31 -0
  630. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_variable_names.py +27 -0
  631. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/variables_are_input_types.py +21 -0
  632. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/variables_in_allowed_position.py +53 -0
  633. wandb/vendor/graphql-core-1.1/wandb_graphql/validation/validation.py +158 -0
  634. wandb/vendor/promise-2.3.0/conftest.py +30 -0
  635. wandb/vendor/promise-2.3.0/setup.py +64 -0
  636. wandb/vendor/promise-2.3.0/tests/__init__.py +0 -0
  637. wandb/vendor/promise-2.3.0/tests/conftest.py +8 -0
  638. wandb/vendor/promise-2.3.0/tests/test_awaitable.py +32 -0
  639. wandb/vendor/promise-2.3.0/tests/test_awaitable_35.py +47 -0
  640. wandb/vendor/promise-2.3.0/tests/test_benchmark.py +116 -0
  641. wandb/vendor/promise-2.3.0/tests/test_complex_threads.py +23 -0
  642. wandb/vendor/promise-2.3.0/tests/test_dataloader.py +452 -0
  643. wandb/vendor/promise-2.3.0/tests/test_dataloader_awaitable_35.py +99 -0
  644. wandb/vendor/promise-2.3.0/tests/test_dataloader_extra.py +65 -0
  645. wandb/vendor/promise-2.3.0/tests/test_extra.py +670 -0
  646. wandb/vendor/promise-2.3.0/tests/test_issues.py +132 -0
  647. wandb/vendor/promise-2.3.0/tests/test_promise_list.py +70 -0
  648. wandb/vendor/promise-2.3.0/tests/test_spec.py +584 -0
  649. wandb/vendor/promise-2.3.0/tests/test_thread_safety.py +115 -0
  650. wandb/vendor/promise-2.3.0/tests/utils.py +3 -0
  651. wandb/vendor/promise-2.3.0/wandb_promise/__init__.py +38 -0
  652. wandb/vendor/promise-2.3.0/wandb_promise/async_.py +135 -0
  653. wandb/vendor/promise-2.3.0/wandb_promise/compat.py +32 -0
  654. wandb/vendor/promise-2.3.0/wandb_promise/dataloader.py +326 -0
  655. wandb/vendor/promise-2.3.0/wandb_promise/iterate_promise.py +12 -0
  656. wandb/vendor/promise-2.3.0/wandb_promise/promise.py +848 -0
  657. wandb/vendor/promise-2.3.0/wandb_promise/promise_list.py +151 -0
  658. wandb/vendor/promise-2.3.0/wandb_promise/pyutils/__init__.py +0 -0
  659. wandb/vendor/promise-2.3.0/wandb_promise/pyutils/version.py +83 -0
  660. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/__init__.py +0 -0
  661. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/asyncio.py +22 -0
  662. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/gevent.py +21 -0
  663. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/immediate.py +27 -0
  664. wandb/vendor/promise-2.3.0/wandb_promise/schedulers/thread.py +18 -0
  665. wandb/vendor/promise-2.3.0/wandb_promise/utils.py +56 -0
  666. wandb/vendor/pygments/__init__.py +90 -0
  667. wandb/vendor/pygments/cmdline.py +568 -0
  668. wandb/vendor/pygments/console.py +74 -0
  669. wandb/vendor/pygments/filter.py +74 -0
  670. wandb/vendor/pygments/filters/__init__.py +350 -0
  671. wandb/vendor/pygments/formatter.py +95 -0
  672. wandb/vendor/pygments/formatters/__init__.py +153 -0
  673. wandb/vendor/pygments/formatters/_mapping.py +85 -0
  674. wandb/vendor/pygments/formatters/bbcode.py +109 -0
  675. wandb/vendor/pygments/formatters/html.py +851 -0
  676. wandb/vendor/pygments/formatters/img.py +600 -0
  677. wandb/vendor/pygments/formatters/irc.py +182 -0
  678. wandb/vendor/pygments/formatters/latex.py +482 -0
  679. wandb/vendor/pygments/formatters/other.py +160 -0
  680. wandb/vendor/pygments/formatters/rtf.py +147 -0
  681. wandb/vendor/pygments/formatters/svg.py +153 -0
  682. wandb/vendor/pygments/formatters/terminal.py +136 -0
  683. wandb/vendor/pygments/formatters/terminal256.py +309 -0
  684. wandb/vendor/pygments/lexer.py +871 -0
  685. wandb/vendor/pygments/lexers/__init__.py +329 -0
  686. wandb/vendor/pygments/lexers/_asy_builtins.py +1645 -0
  687. wandb/vendor/pygments/lexers/_cl_builtins.py +232 -0
  688. wandb/vendor/pygments/lexers/_cocoa_builtins.py +72 -0
  689. wandb/vendor/pygments/lexers/_csound_builtins.py +1346 -0
  690. wandb/vendor/pygments/lexers/_lasso_builtins.py +5327 -0
  691. wandb/vendor/pygments/lexers/_lua_builtins.py +295 -0
  692. wandb/vendor/pygments/lexers/_mapping.py +500 -0
  693. wandb/vendor/pygments/lexers/_mql_builtins.py +1172 -0
  694. wandb/vendor/pygments/lexers/_openedge_builtins.py +2547 -0
  695. wandb/vendor/pygments/lexers/_php_builtins.py +4756 -0
  696. wandb/vendor/pygments/lexers/_postgres_builtins.py +621 -0
  697. wandb/vendor/pygments/lexers/_scilab_builtins.py +3094 -0
  698. wandb/vendor/pygments/lexers/_sourcemod_builtins.py +1163 -0
  699. wandb/vendor/pygments/lexers/_stan_builtins.py +532 -0
  700. wandb/vendor/pygments/lexers/_stata_builtins.py +419 -0
  701. wandb/vendor/pygments/lexers/_tsql_builtins.py +1004 -0
  702. wandb/vendor/pygments/lexers/_vim_builtins.py +1939 -0
  703. wandb/vendor/pygments/lexers/actionscript.py +240 -0
  704. wandb/vendor/pygments/lexers/agile.py +24 -0
  705. wandb/vendor/pygments/lexers/algebra.py +221 -0
  706. wandb/vendor/pygments/lexers/ambient.py +76 -0
  707. wandb/vendor/pygments/lexers/ampl.py +87 -0
  708. wandb/vendor/pygments/lexers/apl.py +101 -0
  709. wandb/vendor/pygments/lexers/archetype.py +318 -0
  710. wandb/vendor/pygments/lexers/asm.py +641 -0
  711. wandb/vendor/pygments/lexers/automation.py +374 -0
  712. wandb/vendor/pygments/lexers/basic.py +500 -0
  713. wandb/vendor/pygments/lexers/bibtex.py +160 -0
  714. wandb/vendor/pygments/lexers/business.py +612 -0
  715. wandb/vendor/pygments/lexers/c_cpp.py +252 -0
  716. wandb/vendor/pygments/lexers/c_like.py +541 -0
  717. wandb/vendor/pygments/lexers/capnproto.py +78 -0
  718. wandb/vendor/pygments/lexers/chapel.py +102 -0
  719. wandb/vendor/pygments/lexers/clean.py +288 -0
  720. wandb/vendor/pygments/lexers/compiled.py +34 -0
  721. wandb/vendor/pygments/lexers/configs.py +833 -0
  722. wandb/vendor/pygments/lexers/console.py +114 -0
  723. wandb/vendor/pygments/lexers/crystal.py +393 -0
  724. wandb/vendor/pygments/lexers/csound.py +366 -0
  725. wandb/vendor/pygments/lexers/css.py +689 -0
  726. wandb/vendor/pygments/lexers/d.py +251 -0
  727. wandb/vendor/pygments/lexers/dalvik.py +125 -0
  728. wandb/vendor/pygments/lexers/data.py +555 -0
  729. wandb/vendor/pygments/lexers/diff.py +165 -0
  730. wandb/vendor/pygments/lexers/dotnet.py +691 -0
  731. wandb/vendor/pygments/lexers/dsls.py +878 -0
  732. wandb/vendor/pygments/lexers/dylan.py +289 -0
  733. wandb/vendor/pygments/lexers/ecl.py +125 -0
  734. wandb/vendor/pygments/lexers/eiffel.py +65 -0
  735. wandb/vendor/pygments/lexers/elm.py +121 -0
  736. wandb/vendor/pygments/lexers/erlang.py +533 -0
  737. wandb/vendor/pygments/lexers/esoteric.py +277 -0
  738. wandb/vendor/pygments/lexers/ezhil.py +69 -0
  739. wandb/vendor/pygments/lexers/factor.py +344 -0
  740. wandb/vendor/pygments/lexers/fantom.py +250 -0
  741. wandb/vendor/pygments/lexers/felix.py +273 -0
  742. wandb/vendor/pygments/lexers/forth.py +177 -0
  743. wandb/vendor/pygments/lexers/fortran.py +205 -0
  744. wandb/vendor/pygments/lexers/foxpro.py +428 -0
  745. wandb/vendor/pygments/lexers/functional.py +21 -0
  746. wandb/vendor/pygments/lexers/go.py +101 -0
  747. wandb/vendor/pygments/lexers/grammar_notation.py +213 -0
  748. wandb/vendor/pygments/lexers/graph.py +80 -0
  749. wandb/vendor/pygments/lexers/graphics.py +553 -0
  750. wandb/vendor/pygments/lexers/haskell.py +843 -0
  751. wandb/vendor/pygments/lexers/haxe.py +936 -0
  752. wandb/vendor/pygments/lexers/hdl.py +382 -0
  753. wandb/vendor/pygments/lexers/hexdump.py +103 -0
  754. wandb/vendor/pygments/lexers/html.py +602 -0
  755. wandb/vendor/pygments/lexers/idl.py +270 -0
  756. wandb/vendor/pygments/lexers/igor.py +288 -0
  757. wandb/vendor/pygments/lexers/inferno.py +96 -0
  758. wandb/vendor/pygments/lexers/installers.py +322 -0
  759. wandb/vendor/pygments/lexers/int_fiction.py +1343 -0
  760. wandb/vendor/pygments/lexers/iolang.py +63 -0
  761. wandb/vendor/pygments/lexers/j.py +146 -0
  762. wandb/vendor/pygments/lexers/javascript.py +1525 -0
  763. wandb/vendor/pygments/lexers/julia.py +333 -0
  764. wandb/vendor/pygments/lexers/jvm.py +1573 -0
  765. wandb/vendor/pygments/lexers/lisp.py +2621 -0
  766. wandb/vendor/pygments/lexers/make.py +202 -0
  767. wandb/vendor/pygments/lexers/markup.py +595 -0
  768. wandb/vendor/pygments/lexers/math.py +21 -0
  769. wandb/vendor/pygments/lexers/matlab.py +663 -0
  770. wandb/vendor/pygments/lexers/ml.py +769 -0
  771. wandb/vendor/pygments/lexers/modeling.py +358 -0
  772. wandb/vendor/pygments/lexers/modula2.py +1561 -0
  773. wandb/vendor/pygments/lexers/monte.py +204 -0
  774. wandb/vendor/pygments/lexers/ncl.py +894 -0
  775. wandb/vendor/pygments/lexers/nimrod.py +159 -0
  776. wandb/vendor/pygments/lexers/nit.py +64 -0
  777. wandb/vendor/pygments/lexers/nix.py +136 -0
  778. wandb/vendor/pygments/lexers/oberon.py +105 -0
  779. wandb/vendor/pygments/lexers/objective.py +504 -0
  780. wandb/vendor/pygments/lexers/ooc.py +85 -0
  781. wandb/vendor/pygments/lexers/other.py +41 -0
  782. wandb/vendor/pygments/lexers/parasail.py +79 -0
  783. wandb/vendor/pygments/lexers/parsers.py +835 -0
  784. wandb/vendor/pygments/lexers/pascal.py +644 -0
  785. wandb/vendor/pygments/lexers/pawn.py +199 -0
  786. wandb/vendor/pygments/lexers/perl.py +620 -0
  787. wandb/vendor/pygments/lexers/php.py +267 -0
  788. wandb/vendor/pygments/lexers/praat.py +294 -0
  789. wandb/vendor/pygments/lexers/prolog.py +306 -0
  790. wandb/vendor/pygments/lexers/python.py +939 -0
  791. wandb/vendor/pygments/lexers/qvt.py +152 -0
  792. wandb/vendor/pygments/lexers/r.py +453 -0
  793. wandb/vendor/pygments/lexers/rdf.py +270 -0
  794. wandb/vendor/pygments/lexers/rebol.py +431 -0
  795. wandb/vendor/pygments/lexers/resource.py +85 -0
  796. wandb/vendor/pygments/lexers/rnc.py +67 -0
  797. wandb/vendor/pygments/lexers/roboconf.py +82 -0
  798. wandb/vendor/pygments/lexers/robotframework.py +560 -0
  799. wandb/vendor/pygments/lexers/ruby.py +519 -0
  800. wandb/vendor/pygments/lexers/rust.py +220 -0
  801. wandb/vendor/pygments/lexers/sas.py +228 -0
  802. wandb/vendor/pygments/lexers/scripting.py +1222 -0
  803. wandb/vendor/pygments/lexers/shell.py +794 -0
  804. wandb/vendor/pygments/lexers/smalltalk.py +195 -0
  805. wandb/vendor/pygments/lexers/smv.py +79 -0
  806. wandb/vendor/pygments/lexers/snobol.py +83 -0
  807. wandb/vendor/pygments/lexers/special.py +103 -0
  808. wandb/vendor/pygments/lexers/sql.py +681 -0
  809. wandb/vendor/pygments/lexers/stata.py +108 -0
  810. wandb/vendor/pygments/lexers/supercollider.py +90 -0
  811. wandb/vendor/pygments/lexers/tcl.py +145 -0
  812. wandb/vendor/pygments/lexers/templates.py +2283 -0
  813. wandb/vendor/pygments/lexers/testing.py +207 -0
  814. wandb/vendor/pygments/lexers/text.py +25 -0
  815. wandb/vendor/pygments/lexers/textedit.py +169 -0
  816. wandb/vendor/pygments/lexers/textfmts.py +297 -0
  817. wandb/vendor/pygments/lexers/theorem.py +458 -0
  818. wandb/vendor/pygments/lexers/trafficscript.py +54 -0
  819. wandb/vendor/pygments/lexers/typoscript.py +226 -0
  820. wandb/vendor/pygments/lexers/urbi.py +133 -0
  821. wandb/vendor/pygments/lexers/varnish.py +190 -0
  822. wandb/vendor/pygments/lexers/verification.py +111 -0
  823. wandb/vendor/pygments/lexers/web.py +24 -0
  824. wandb/vendor/pygments/lexers/webmisc.py +988 -0
  825. wandb/vendor/pygments/lexers/whiley.py +116 -0
  826. wandb/vendor/pygments/lexers/x10.py +69 -0
  827. wandb/vendor/pygments/modeline.py +44 -0
  828. wandb/vendor/pygments/plugin.py +68 -0
  829. wandb/vendor/pygments/regexopt.py +92 -0
  830. wandb/vendor/pygments/scanner.py +105 -0
  831. wandb/vendor/pygments/sphinxext.py +158 -0
  832. wandb/vendor/pygments/style.py +155 -0
  833. wandb/vendor/pygments/styles/__init__.py +80 -0
  834. wandb/vendor/pygments/styles/abap.py +29 -0
  835. wandb/vendor/pygments/styles/algol.py +63 -0
  836. wandb/vendor/pygments/styles/algol_nu.py +63 -0
  837. wandb/vendor/pygments/styles/arduino.py +98 -0
  838. wandb/vendor/pygments/styles/autumn.py +65 -0
  839. wandb/vendor/pygments/styles/borland.py +51 -0
  840. wandb/vendor/pygments/styles/bw.py +49 -0
  841. wandb/vendor/pygments/styles/colorful.py +81 -0
  842. wandb/vendor/pygments/styles/default.py +73 -0
  843. wandb/vendor/pygments/styles/emacs.py +72 -0
  844. wandb/vendor/pygments/styles/friendly.py +72 -0
  845. wandb/vendor/pygments/styles/fruity.py +42 -0
  846. wandb/vendor/pygments/styles/igor.py +29 -0
  847. wandb/vendor/pygments/styles/lovelace.py +97 -0
  848. wandb/vendor/pygments/styles/manni.py +75 -0
  849. wandb/vendor/pygments/styles/monokai.py +106 -0
  850. wandb/vendor/pygments/styles/murphy.py +80 -0
  851. wandb/vendor/pygments/styles/native.py +65 -0
  852. wandb/vendor/pygments/styles/paraiso_dark.py +125 -0
  853. wandb/vendor/pygments/styles/paraiso_light.py +125 -0
  854. wandb/vendor/pygments/styles/pastie.py +75 -0
  855. wandb/vendor/pygments/styles/perldoc.py +69 -0
  856. wandb/vendor/pygments/styles/rainbow_dash.py +89 -0
  857. wandb/vendor/pygments/styles/rrt.py +33 -0
  858. wandb/vendor/pygments/styles/sas.py +44 -0
  859. wandb/vendor/pygments/styles/stata.py +40 -0
  860. wandb/vendor/pygments/styles/tango.py +141 -0
  861. wandb/vendor/pygments/styles/trac.py +63 -0
  862. wandb/vendor/pygments/styles/vim.py +63 -0
  863. wandb/vendor/pygments/styles/vs.py +38 -0
  864. wandb/vendor/pygments/styles/xcode.py +51 -0
  865. wandb/vendor/pygments/token.py +213 -0
  866. wandb/vendor/pygments/unistring.py +217 -0
  867. wandb/vendor/pygments/util.py +388 -0
  868. wandb/vendor/watchdog_0_9_0/wandb_watchdog/__init__.py +17 -0
  869. wandb/vendor/watchdog_0_9_0/wandb_watchdog/events.py +615 -0
  870. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/__init__.py +98 -0
  871. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/api.py +369 -0
  872. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/fsevents.py +172 -0
  873. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/fsevents2.py +239 -0
  874. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify.py +218 -0
  875. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify_buffer.py +81 -0
  876. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify_c.py +575 -0
  877. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/kqueue.py +730 -0
  878. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/polling.py +145 -0
  879. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/read_directory_changes.py +133 -0
  880. wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/winapi.py +348 -0
  881. wandb/vendor/watchdog_0_9_0/wandb_watchdog/patterns.py +265 -0
  882. wandb/vendor/watchdog_0_9_0/wandb_watchdog/tricks/__init__.py +174 -0
  883. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/__init__.py +151 -0
  884. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/bricks.py +249 -0
  885. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/compat.py +29 -0
  886. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/decorators.py +198 -0
  887. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/delayed_queue.py +88 -0
  888. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/dirsnapshot.py +293 -0
  889. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/echo.py +157 -0
  890. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/event_backport.py +41 -0
  891. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/importlib2.py +40 -0
  892. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/platform.py +57 -0
  893. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/unicode_paths.py +64 -0
  894. wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/win32stat.py +123 -0
  895. wandb/vendor/watchdog_0_9_0/wandb_watchdog/version.py +28 -0
  896. wandb/vendor/watchdog_0_9_0/wandb_watchdog/watchmedo.py +577 -0
  897. wandb/wandb_agent.py +580 -0
  898. wandb/wandb_controller.py +719 -0
  899. wandb/wandb_run.py +8 -0
  900. wandb-0.21.2.dist-info/METADATA +223 -0
  901. wandb-0.21.2.dist-info/RECORD +904 -0
  902. wandb-0.21.2.dist-info/WHEEL +4 -0
  903. wandb-0.21.2.dist-info/entry_points.txt +3 -0
  904. wandb-0.21.2.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1285 @@
1
+ """Implementation of KubernetesRunner class for wandb launch."""
2
+
3
+ import asyncio
4
+ import base64
5
+ import datetime
6
+ import json
7
+ import logging
8
+ import os
9
+ import time
10
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
11
+
12
+ import yaml
13
+
14
+ import wandb
15
+ from wandb.apis.internal import Api
16
+ from wandb.sdk.launch.agent.agent import LaunchAgent
17
+ from wandb.sdk.launch.environment.abstract import AbstractEnvironment
18
+ from wandb.sdk.launch.registry.abstract import AbstractRegistry
19
+ from wandb.sdk.launch.registry.azure_container_registry import AzureContainerRegistry
20
+ from wandb.sdk.launch.registry.local_registry import LocalRegistry
21
+ from wandb.sdk.launch.runner.abstract import Status
22
+ from wandb.sdk.launch.runner.kubernetes_monitor import (
23
+ WANDB_K8S_LABEL_AGENT,
24
+ WANDB_K8S_LABEL_AUXILIARY_RESOURCE,
25
+ WANDB_K8S_LABEL_MONITOR,
26
+ WANDB_K8S_RUN_ID,
27
+ CustomResource,
28
+ LaunchKubernetesMonitor,
29
+ )
30
+ from wandb.sdk.launch.utils import recursive_macro_sub
31
+ from wandb.sdk.lib.retry import ExponentialBackoff, retry_async
32
+ from wandb.util import get_module
33
+
34
+ from .._project_spec import EntryPoint, LaunchProject
35
+ from ..errors import LaunchError
36
+ from ..utils import (
37
+ CODE_MOUNT_DIR,
38
+ LOG_PREFIX,
39
+ MAX_ENV_LENGTHS,
40
+ PROJECT_SYNCHRONOUS,
41
+ get_kube_context_and_api_client,
42
+ make_name_dns_safe,
43
+ )
44
+ from .abstract import AbstractRun, AbstractRunner
45
+
46
+ get_module(
47
+ "kubernetes_asyncio",
48
+ required="Kubernetes runner requires the kubernetes package. Please install it with `pip install wandb[launch]`.",
49
+ )
50
+
51
+ import kubernetes_asyncio # type: ignore # noqa: E402
52
+ from kubernetes_asyncio import client # noqa: E402
53
+ from kubernetes_asyncio.client.api.apps_v1_api import ( # type: ignore # noqa: E402
54
+ AppsV1Api,
55
+ )
56
+ from kubernetes_asyncio.client.api.batch_v1_api import ( # type: ignore # noqa: E402
57
+ BatchV1Api,
58
+ )
59
+ from kubernetes_asyncio.client.api.core_v1_api import ( # type: ignore # noqa: E402
60
+ CoreV1Api,
61
+ )
62
+ from kubernetes_asyncio.client.api.custom_objects_api import ( # type: ignore # noqa: E402
63
+ CustomObjectsApi,
64
+ )
65
+ from kubernetes_asyncio.client.api.networking_v1_api import ( # type: ignore # noqa: E402
66
+ NetworkingV1Api,
67
+ )
68
+ from kubernetes_asyncio.client.models.v1_secret import ( # type: ignore # noqa: E402
69
+ V1Secret,
70
+ )
71
+ from kubernetes_asyncio.client.rest import ApiException # type: ignore # noqa: E402
72
+
73
+ TIMEOUT = 5
74
+ API_KEY_SECRET_MAX_RETRIES = 5
75
+
76
+ _logger = logging.getLogger(__name__)
77
+
78
+
79
+ SOURCE_CODE_PVC_MOUNT_PATH = os.environ.get("WANDB_LAUNCH_CODE_PVC_MOUNT_PATH")
80
+ SOURCE_CODE_PVC_NAME = os.environ.get("WANDB_LAUNCH_CODE_PVC_NAME")
81
+
82
+
83
class KubernetesSubmittedRun(AbstractRun):
    """Wrapper for a launched run on Kubernetes."""

    def __init__(
        self,
        batch_api: "BatchV1Api",
        core_api: "CoreV1Api",
        apps_api: "AppsV1Api",
        network_api: "NetworkingV1Api",
        name: str,
        namespace: Optional[str] = "default",
        secret: Optional["V1Secret"] = None,
        auxiliary_resource_label_key: Optional[str] = None,
    ) -> None:
        """Initialize a KubernetesSubmittedRun.

        This object does not query the cluster for the job state itself:
        `get_status` returns whatever `LaunchKubernetesMonitor.get_status`
        reports for the job name.

        Arguments:
            batch_api: Kubernetes BatchV1Api object, used to delete the job and
                auxiliary jobs.
            core_api: Kubernetes CoreV1Api object, used for pods, logs,
                secrets and services.
            apps_api: Kubernetes AppsV1Api object, used to delete auxiliary
                deployments.
            network_api: Kubernetes NetworkingV1Api object, used to delete
                auxiliary network policies.
            name: Name of the job.
            namespace: Kubernetes namespace.
            secret: Kubernetes secret to delete once the run is over.
            auxiliary_resource_label_key: Value of the
                `WANDB_K8S_LABEL_AUXILIARY_RESOURCE` label identifying extra
                resources to delete with the run. `None` disables that cleanup.

        Returns:
            None.
        """
        self.batch_api = batch_api
        self.core_api = core_api
        self.apps_api = apps_api
        self.network_api = network_api
        self.name = name
        self.namespace = namespace
        self._fail_count = 0
        self.secret = secret
        self.auxiliary_resource_label_key = auxiliary_resource_label_key

    @property
    def id(self) -> str:
        """Return the run id."""
        return self.name

    async def get_logs(self) -> Optional[str]:
        """Return the logs of the first pod found for this job.

        Returns:
            The pod log as a string, or None when no pod or no log output is
            found or when the lookup fails (a warning/error is printed).
        """
        try:
            pods = await self.core_api.list_namespaced_pod(
                label_selector=f"job-name={self.name}", namespace=self.namespace
            )
            pod_names = [pi.metadata.name for pi in pods.items]
            if not pod_names:
                wandb.termwarn(f"Found no pods for kubernetes job: {self.name}")
                return None
            logs = await self.core_api.read_namespaced_pod_log(
                name=pod_names[0], namespace=self.namespace
            )
            if logs:
                return str(logs)
            else:
                wandb.termwarn(f"No logs for kubernetes pod(s): {pod_names}")
                return None
        except Exception as e:
            # Log retrieval is best-effort; never let it break the caller.
            wandb.termerror(f"{LOG_PREFIX}Failed to get pod logs: {e}")
            return None

    async def wait(self) -> bool:
        """Wait for the run to finish.

        Polls `get_status` every 5 seconds until the state is "finished",
        "failed" or "preempted", then removes the secret and any auxiliary
        resources.

        Returns:
            True if the run finished successfully, False otherwise.
        """
        while True:
            status = await self.get_status()
            wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {status.state}")
            if status.state in ["finished", "failed", "preempted"]:
                break
            await asyncio.sleep(5)

        await self._delete_secret()
        await self._delete_auxiliary_resources_by_label()
        return (
            status.state == "finished"
        )  # todo: not sure if this (copied from aws runner) is the right approach? should we return false on failure

    async def get_status(self) -> Status:
        """Return the monitor's status for this job.

        When the state is terminal ("stopped", "failed", "finished" or
        "preempted") the secret and auxiliary resources are deleted as a side
        effect.
        """
        status = LaunchKubernetesMonitor.get_status(self.name)
        # Compare the state string explicitly, exactly as `wait` does.
        if status.state in ["stopped", "failed", "finished", "preempted"]:
            await self._delete_secret()
            await self._delete_auxiliary_resources_by_label()
        return status

    async def cancel(self) -> None:
        """Cancel the run.

        Raises:
            LaunchError: If the Kubernetes API rejects deleting the job or its
                secret.
        """
        try:
            await self.batch_api.delete_namespaced_job(
                namespace=self.namespace,
                name=self.name,
            )
            await self._delete_secret()
            await self._delete_auxiliary_resources_by_label()
        except ApiException as e:
            raise LaunchError(
                f"Failed to delete Kubernetes Job {self.name} in namespace {self.namespace}: {str(e)}"
            ) from e

    async def _delete_secret(self) -> None:
        """Delete the run's secret once, unless WANDB_RELEASE_NAME is set.

        Raises:
            ApiException: If the delete fails for a reason other than the
                secret already being gone.
        """
        # Cleanup secret if not running in a helm-managed context
        if os.environ.get("WANDB_RELEASE_NAME") or not self.secret:
            return
        try:
            await self.core_api.delete_namespaced_secret(
                name=self.secret.metadata.name,
                namespace=self.secret.metadata.namespace,
            )
        except ApiException as e:
            # A 404 means the secret no longer exists, which is the desired
            # end state. Re-raising it would make every later call retry the
            # delete and fail again, because `self.secret` was never cleared.
            if e.status != 404:
                raise
        self.secret = None

    async def _delete_auxiliary_resources_by_label(self) -> None:
        """Delete every resource carrying this run's auxiliary-resource label.

        Best-effort: failures are reported with `wandb.termwarn` and never
        raised. Does nothing when no label key was configured.
        """
        if self.auxiliary_resource_label_key is None:
            return

        label_selector = (
            f"{WANDB_K8S_LABEL_AUXILIARY_RESOURCE}={self.auxiliary_resource_label_key}"
        )

        try:
            # (client, suffix) pairs; the suffix selects the
            # list_namespaced_* / delete_namespaced_* methods on the client.
            resource_cleanups = [
                (self.core_api, "service"),
                (self.batch_api, "job"),
                (self.core_api, "pod"),
                (self.core_api, "secret"),
                (self.apps_api, "deployment"),
                (self.network_api, "network_policy"),
            ]

            for api_client, resource_type in resource_cleanups:
                try:
                    list_method = getattr(
                        api_client, f"list_namespaced_{resource_type}"
                    )
                    delete_method = getattr(
                        api_client, f"delete_namespaced_{resource_type}"
                    )

                    # List resources with our label
                    resources = await list_method(
                        namespace=self.namespace, label_selector=label_selector
                    )
                    items = resources.items
                except (AttributeError, ApiException) as e:
                    wandb.termwarn(f"Could not clean up {resource_type}: {e}")
                    continue

                # Delete each resource. Failures are handled per resource so
                # that one failed delete does not leave the remaining
                # resources of the same type behind.
                for resource in items:
                    try:
                        await delete_method(
                            name=resource.metadata.name, namespace=self.namespace
                        )
                    except (AttributeError, ApiException) as e:
                        wandb.termwarn(f"Could not clean up {resource_type}: {e}")

        except Exception as e:
            wandb.termwarn(f"Failed to clean up some auxiliary resources: {e}")
246
+
247
+
248
class CrdSubmittedRun(AbstractRun):
    """Handle for a run submitted as a custom resource, e.g. a Volcano job."""

    def __init__(
        self,
        group: str,
        version: str,
        plural: str,
        name: str,
        namespace: str,
        core_api: CoreV1Api,
        custom_api: CustomObjectsApi,
    ) -> None:
        """Store the coordinates of the custom object and the API clients.

        Arguments:
            group: API group of the custom resource.
            version: API version of the custom resource.
            plural: Plural name of the custom resource.
            name: Name of the custom object.
            namespace: Namespace the custom object lives in.
            core_api: Kubernetes core API client (used for pods and logs).
            custom_api: Kubernetes custom objects API client (used to delete
                the object).
        """
        # API clients.
        self.core_api = core_api
        self.custom_api = custom_api
        # Coordinates of the custom object.
        self.group = group
        self.version = version
        self.plural = plural
        self.namespace = namespace
        self.name = name
        self._fail_count = 0

    @property
    def id(self) -> str:
        """Return the custom object's name, which doubles as the run id."""
        return self.name

    async def get_logs(self) -> Optional[str]:
        """Collect the logs of every pod labelled with this run's name.

        Returns:
            One "Pod <name>:" section per pod joined by newlines, or None when
            there are no pods or the Kubernetes API call fails.
        """
        # TODO: test more carefully once we release multi-node support
        logs_by_pod: Dict[str, Optional[str]] = {}
        try:
            pod_list = await self.core_api.list_namespaced_pod(
                label_selector=f"wandb/run-id={self.name}", namespace=self.namespace
            )
            names = [pod.metadata.name for pod in pod_list.items]
            for current in names:
                logs_by_pod[current] = await self.core_api.read_namespaced_pod_log(
                    name=current, namespace=self.namespace
                )
        except ApiException as e:
            wandb.termwarn(f"Failed to get logs for {self.name}: {str(e)}")
            return None

        if len(logs_by_pod) == 0:
            return None
        return "\n".join(
            f"Pod {pod_name}:\n{log}" for pod_name, log in logs_by_pod.items()
        )

    async def get_status(self) -> Status:
        """Return the status the Kubernetes monitor holds for this object."""
        return LaunchKubernetesMonitor.get_status(self.name)

    async def cancel(self) -> None:
        """Delete the custom object.

        Raises:
            LaunchError: If the Kubernetes API rejects the deletion.
        """
        target = {
            "group": self.group,
            "version": self.version,
            "namespace": self.namespace,
            "plural": self.plural,
            "name": self.name,
        }
        try:
            await self.custom_api.delete_namespaced_custom_object(**target)
        except ApiException as e:
            raise LaunchError(
                f"Failed to delete CRD {self.name} in namespace {self.namespace}: {str(e)}"
            ) from e

    async def wait(self) -> bool:
        """Poll every 5 seconds until the object reaches a final state.

        Returns:
            True when the final state is "finished", False for "failed" or
            "preempted".
        """
        final_states = ["finished", "failed", "preempted"]

        status = await self.get_status()
        wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {status}")
        while status.state not in final_states:
            await asyncio.sleep(5)
            status = await self.get_status()
            wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {status}")

        return status.state == "finished"
337
+
338
+
339
+ class KubernetesRunner(AbstractRunner):
340
+ """Launches runs onto kubernetes."""
341
+
342
def __init__(
    self,
    api: Api,
    backend_config: Dict[str, Any],
    environment: AbstractEnvironment,
    registry: AbstractRegistry,
) -> None:
    """Set up a runner that launches onto Kubernetes.

    `api` and `backend_config` are handed to the parent initializer; the
    environment and registry are kept on the instance.

    Arguments:
        api: The API client object.
        backend_config: The backend configuration.
        environment: The environment to launch runs into.
        registry: The container registry associated with this runner.
    """
    super().__init__(api, backend_config)
    self.registry = registry
    self.environment = environment
362
+
363
def get_namespace(
    self, resource_args: Dict[str, Any], context: Dict[str, Any]
) -> str:
    """Resolve the namespace a run should be launched into.

    Candidates are tried in order and the first truthy one wins:
    ``resource_args["metadata"]["namespace"]``, the top-level
    ``resource_args["namespace"]``, the ``runner.namespace`` entry of the
    backend config, and finally the namespace of the k8s config context
    (or "default" when there is no context or it names no namespace).

    Arguments:
        resource_args: The resource args to launch.
        context: The k8s config context.

    Returns:
        The namespace to launch into.
    """
    fallback = "default"
    if context:
        fallback = context["context"].get("namespace", "default")

    from_metadata = resource_args.get("metadata", {}).get("namespace")
    if from_metadata:
        return from_metadata  # type: ignore[no-any-return]

    # continue support for malformed namespace
    from_top_level = resource_args.get("namespace")
    if from_top_level:
        return from_top_level  # type: ignore[no-any-return]

    from_runner_config = self.backend_config.get("runner", {}).get("namespace")
    if from_runner_config:
        return from_runner_config  # type: ignore[no-any-return]

    return fallback  # type: ignore[no-any-return]
386
+
387
async def _inject_defaults(
    self,
    resource_args: Dict[str, Any],
    launch_project: LaunchProject,
    image_uri: str,
    namespace: str,
    core_api: "CoreV1Api",
) -> Tuple[Dict[str, Any], Optional["V1Secret"]]:
    """Apply our default values, return job dict and api key secret.

    Arguments:
        resource_args (Dict[str, Any]): The resource args to launch.
        launch_project (LaunchProject): The launch project.
        image_uri (str): The image set on the first container when the
            project supplies a docker image or when no container names an
            image.
        namespace (str): The namespace.
        core_api (CoreV1Api): The core api.

    Returns:
        Tuple[Dict[str, Any], Optional["V1Secret"]]: The resource args and api key secret.
    """
    job: Dict[str, Any] = {
        "apiVersion": "batch/v1",
        "kind": "Job",
    }
    job.update(resource_args)

    job_metadata: Dict[str, Any] = job.get("metadata", {})
    job_spec: Dict[str, Any] = {"backoffLimit": 0, "ttlSecondsAfterFinished": 60}
    job_spec.update(job.get("spec", {}))
    pod_template: Dict[str, Any] = job_spec.get("template", {})
    pod_spec: Dict[str, Any] = {"restartPolicy": "Never"}
    pod_spec.update(pod_template.get("spec", {}))
    containers: List[Dict[str, Any]] = pod_spec.get("containers", [{}])

    # Add labels to job metadata
    job_metadata.setdefault("labels", {})
    job_metadata["labels"][WANDB_K8S_RUN_ID] = launch_project.run_id
    job_metadata["labels"][WANDB_K8S_LABEL_MONITOR] = "true"
    if LaunchAgent.initialized():
        job_metadata["labels"][WANDB_K8S_LABEL_AGENT] = LaunchAgent.name()
    # name precedence: name in spec > generated name
    if not job_metadata.get("name"):
        job_metadata["generateName"] = make_name_dns_safe(
            f"launch-{launch_project.target_entity}-{launch_project.target_project}-"
        )
    job_metadata["namespace"] = namespace

    # Fill in a name and a restrictive security context only where the user
    # spec did not provide one.
    for i, cont in enumerate(containers):
        cont.setdefault("name", "launch" + str(i))
        cont.setdefault(
            "securityContext",
            {
                "allowPrivilegeEscalation": False,
                "capabilities": {"drop": ["ALL"]},
                "seccompProfile": {"type": "RuntimeDefault"},
            },
        )

    entry_point = (
        launch_project.override_entrypoint or launch_project.get_job_entry_point()
    )
    if launch_project.docker_image:
        # dont specify run id if user provided image, could have multiple runs
        containers[0]["image"] = image_uri
        # TODO: handle secret pulling image from registry
    elif not any("image" in cont for cont in containers):
        assert entry_point is not None
        # in the non instance case we need to make an imagePullSecret
        # so the new job can pull the image
        containers[0]["image"] = image_uri
        secret = await maybe_create_imagepull_secret(
            core_api, self.registry, launch_project.run_id, namespace
        )
        if secret is not None:
            pod_spec["imagePullSecrets"] = [
                {"name": f"regcred-{launch_project.run_id}"}
            ]

    inject_entrypoint_and_args(
        containers,
        entry_point,
        launch_project.override_args,
        launch_project.override_entrypoint is not None,
    )

    env_vars = launch_project.get_env_vars_dict(
        self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
    )

    def handle_exception(e):
        wandb.termwarn(
            f"Exception when ensuring Kubernetes API key secret: {e}. Retrying..."
        )

    api_key_secret = None
    for cont in containers:
        # Add our env vars to user supplied env vars
        env = cont.get("env") or []
        for key, value in env_vars.items():
            if (
                key == "WANDB_API_KEY"
                and value
                and (
                    LaunchAgent.initialized()
                    or self.backend_config[PROJECT_SYNCHRONOUS]
                )
            ):
                # Override API key with secret. TODO: Do the same for other runners
                release_name = os.environ.get("WANDB_RELEASE_NAME")
                secret_name = "wandb-api-key"
                if release_name:
                    secret_name += f"-{release_name}"
                else:
                    secret_name += f"-{launch_project.run_id}"

                # The secret is shared by every container in the job, so it
                # only needs to be ensured once rather than once per
                # container.
                if api_key_secret is None:
                    api_key_secret = await retry_async(
                        backoff=ExponentialBackoff(
                            initial_sleep=datetime.timedelta(seconds=1),
                            max_sleep=datetime.timedelta(minutes=1),
                            max_retries=API_KEY_SECRET_MAX_RETRIES,
                        ),
                        fn=ensure_api_key_secret,
                        on_exc=handle_exception,
                        core_api=core_api,
                        secret_name=secret_name,
                        namespace=namespace,
                        api_key=value,
                    )
                env.append(
                    {
                        "name": key,
                        "valueFrom": {
                            "secretKeyRef": {
                                "name": secret_name,
                                "key": "password",
                            }
                        },
                    }
                )
            else:
                env.append({"name": key, "value": value})
        cont["env"] = env

    # Reassemble the job from the (possibly freshly created) pieces.
    pod_spec["containers"] = containers
    pod_template["spec"] = pod_spec
    job_spec["template"] = pod_template
    job["spec"] = job_spec
    job["metadata"] = job_metadata

    add_label_to_pods(
        job,
        WANDB_K8S_LABEL_MONITOR,
        "true",
    )

    if launch_project.job_base_image:
        apply_code_mount_configuration(
            job,
            launch_project,
        )

    # Add wandb.ai/agent: current agent label on all pods
    if LaunchAgent.initialized():
        add_label_to_pods(
            job,
            WANDB_K8S_LABEL_AGENT,
            LaunchAgent.name(),
        )

    return job, api_key_secret
555
+
556
async def _wait_for_resource_ready(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    config: Dict[str, Any],
    namespace: str,
    timeout_seconds: int = 300,
) -> None:
    """Block until a freshly created Kubernetes resource reports ready.

    Deployments, Services and Pods have dedicated readiness checks; any
    other kind simply gets a fixed five second pause. If the config has no
    ``kind`` or no ``metadata.name``, an error is logged and nothing is
    waited on.

    Arguments:
        api_client: The Kubernetes API client.
        config: The resource configuration.
        namespace: The namespace where the resource was created.
        timeout_seconds: Maximum time to wait for readiness.
    """
    resource_kind = config.get("kind")
    resource_name = config.get("metadata", {}).get("name")

    if not (resource_kind and resource_name):
        wandb.termerror(
            f"{LOG_PREFIX}Cannot wait for resource without kind or name"
        )
        return

    wandb.termlog(
        f"{LOG_PREFIX}Waiting for {resource_kind} '{resource_name}' to be ready..."
    )

    start_time = time.time()

    # Pick the kind-specific readiness check, if there is one.
    if resource_kind == "Deployment":
        readiness_check = self._wait_for_deployment_ready
    elif resource_kind == "Service":
        readiness_check = self._wait_for_service_ready
    elif resource_kind == "Pod":
        readiness_check = self._wait_for_pod_ready
    else:
        readiness_check = None

    if readiness_check is None:
        wandb.termlog(
            f"{LOG_PREFIX}No specific readiness check for {resource_kind}, waiting 5 seconds..."
        )
        await asyncio.sleep(5)
    else:
        await readiness_check(
            api_client, resource_name, namespace, timeout_seconds
        )

    elapsed = time.time() - start_time
    wandb.termlog(
        f"{LOG_PREFIX}{resource_kind} '{resource_name}' is ready after {elapsed:.1f}s"
    )
608
+
609
async def _wait_for_deployment_ready(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    name: str,
    namespace: str,
    timeout_seconds: int,
) -> None:
    """Wait until a Deployment has at least as many ready replicas as replicas.

    The deployment is considered not ready while either count is missing
    or zero.
    """
    apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)

    async def _is_ready():
        deployment = await apps_api.read_namespaced_deployment(
            name=name, namespace=namespace
        )
        status = deployment.status

        ready_count = status.ready_replicas
        if not ready_count:
            return False
        wanted_count = status.replicas
        if not wanted_count:
            return False
        return ready_count >= wanted_count

    await self._wait_with_timeout(_is_ready, timeout_seconds, name)
631
+
632
async def _wait_for_service_ready(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    name: str,
    namespace: str,
    timeout_seconds: int,
) -> None:
    """Wait until the Service's endpoints list at least one address."""
    core_api = kubernetes_asyncio.client.CoreV1Api(api_client)

    async def _has_ready_address():
        endpoints = await core_api.read_namespaced_endpoints(
            name=name, namespace=namespace
        )
        # Addresses listed on a subset are the ready pod addresses.
        return any(
            True for subset in (endpoints.subsets or []) if subset.addresses
        )

    await self._wait_with_timeout(_has_ready_address, timeout_seconds, name)
653
+
654
async def _wait_for_pod_ready(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    name: str,
    namespace: str,
    timeout_seconds: int,
) -> None:
    """Wait until a Pod is Running and all reported containers are ready.

    A Running pod that reports no container statuses counts as ready.
    """
    core_api = kubernetes_asyncio.client.CoreV1Api(api_client)

    async def _is_ready():
        pod = await core_api.read_namespaced_pod(name=name, namespace=namespace)
        if pod.status.phase != "Running":
            return False
        if not pod.status.container_statuses:
            return True
        return all(
            container.ready for container in pod.status.container_statuses
        )

    await self._wait_with_timeout(_is_ready, timeout_seconds, name)
673
+
674
async def _wait_with_timeout(
    self, check_func, timeout_seconds: int, name: str
) -> None:
    """Poll a readiness check until it succeeds or the timeout elapses.

    ``check_func`` is awaited every two seconds. A 404 from the Kubernetes
    API is treated as "not there yet" and polling continues; any other
    error is logged and re-raised.

    Arguments:
        check_func: Zero-argument coroutine function returning a truthy
            value once the resource is ready.
        timeout_seconds: Maximum time to keep polling.
        name: Resource name, used in log and error messages.

    Raises:
        LaunchError: If the resource is not ready within ``timeout_seconds``.
    """
    # Use the monotonic clock so wall-clock adjustments (NTP, manual changes)
    # cannot shorten or extend the timeout.
    deadline = time.monotonic() + timeout_seconds

    while time.monotonic() < deadline:
        try:
            if await check_func():
                return
        except kubernetes_asyncio.client.ApiException as e:
            # 404 means the resource is not visible yet: keep polling.
            if e.status != 404:
                wandb.termerror(
                    f"{LOG_PREFIX}Error waiting for resource '{name}': {e}"
                )
                raise
        except Exception as e:
            wandb.termerror(f"{LOG_PREFIX}Error waiting for resource '{name}': {e}")
            raise
        await asyncio.sleep(2)

    raise LaunchError(
        f"Resource '{name}' not ready within {timeout_seconds} seconds"
    )
700
+
701
async def _prepare_resource(
    self,
    api_client: kubernetes_asyncio.client.ApiClient,
    config: Dict[str, Any],
    namespace: str,
    run_id: str,
    launch_project: LaunchProject,
    api_key_secret: Optional["V1Secret"] = None,
    wait_for_ready: bool = True,
    wait_timeout: int = 300,
) -> None:
    """Label, configure and create an additional Kubernetes resource.

    The resource is labelled with the run id and marked as created by the
    launch agent, ``WANDB_CONFIG`` is injected into its containers, and, if
    an API key secret is given, ``WANDB_API_KEY`` is wired to that secret.
    ``config`` is modified in place.

    Arguments:
        api_client: The Kubernetes API client.
        config: The resource configuration to prepare.
        namespace: The namespace to create the resource in.
        run_id: The run ID to label the resource with.
        launch_project: The launch project to get environment variables from.
        api_key_secret: The API key secret to inject.
        wait_for_ready: Whether to wait for the resource to be ready after creation.
        wait_timeout: Maximum time in seconds to wait for resource readiness.

    Raises:
        LaunchError: If creating the resource or waiting for readiness
            fails. The underlying exception is chained as the cause.
    """
    config.setdefault("metadata", {})
    config["metadata"].setdefault("labels", {})
    config["metadata"]["labels"][WANDB_K8S_RUN_ID] = run_id
    config["metadata"]["labels"]["wandb.ai/created-by"] = "launch-agent"

    env_vars = launch_project.get_env_vars_dict(
        self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
    )
    # Only the run config is forwarded to additional resources.
    wandb_config_env = {
        "WANDB_CONFIG": env_vars.get("WANDB_CONFIG", "{}"),
    }
    add_wandb_env(config, wandb_config_env)

    if api_key_secret:
        for cont in yield_containers(config):
            # setdefault already stores the list on the container.
            cont.setdefault("env", []).append(
                {
                    "name": "WANDB_API_KEY",
                    "valueFrom": {
                        "secretKeyRef": {
                            "name": api_key_secret.metadata.name,
                            "key": "password",
                        }
                    },
                }
            )

    try:
        await kubernetes_asyncio.utils.create_from_dict(
            api_client, config, namespace=namespace
        )

        if wait_for_ready:
            await self._wait_for_resource_ready(
                api_client, config, namespace, wait_timeout
            )
    except Exception as e:
        wandb.termerror(f"{LOG_PREFIX}Failed to create Kubernetes resource: {e}")
        # Chain the cause explicitly, matching the other LaunchError sites.
        raise LaunchError(f"Failed to create Kubernetes resource: {e}") from e
765
+
766
async def run(
    self, launch_project: LaunchProject, image_uri: str
) -> Optional[AbstractRun]:
    """Execute a launch project on Kubernetes.

    When the resource args name an ``apiVersion`` other than ``batch/v1`` or
    ``batch/v1beta1`` the run is submitted as a custom object; otherwise a
    batch Job is built with ``_inject_defaults`` and created, after any
    ``additional_services`` from the launch spec have been created.

    Arguments:
        launch_project: The launch project to execute.
        image_uri: The image to run; substituted into resource-arg macros
            and passed on when building the batch Job.

    Returns:
        The submitted run object. When the backend config requests
        synchronous execution, it is returned only after the run finished.

    Raises:
        LaunchError: If the source-code PVC settings are missing for a base
            image job, or if creating the Kubernetes object fails.
    """
    await LaunchKubernetesMonitor.ensure_initialized()
    resource_args = launch_project.fill_macros(image_uri).get("kubernetes", {})
    if not resource_args:
        wandb.termlog(
            f"{LOG_PREFIX}Note: no resource args specified. Add a "
            "Kubernetes yaml spec or other options in a json file "
            "with --resource-args <json>."
        )
    _logger.info(f"Running Kubernetes job with resource args: {resource_args}")

    context, api_client = await get_kube_context_and_api_client(
        kubernetes_asyncio, resource_args
    )

    # If using pvc for code mount, move code there.
    if launch_project.job_base_image is not None:
        if SOURCE_CODE_PVC_NAME is None or SOURCE_CODE_PVC_MOUNT_PATH is None:
            raise LaunchError(
                "WANDB_LAUNCH_SOURCE_CODE_PVC_ environment variables not set. "
                "Unable to mount source code PVC into base image. "
                "Use the `codeMountPvcName` variable in the agent helm chart "
                "to enable base image jobs for this agent. See "
                "https://github.com/wandb/helm-charts/tree/main/charts/launch-agent "
                "for more information."
            )
        code_subdir = launch_project.get_image_source_string()
        launch_project.change_project_dir(
            os.path.join(SOURCE_CODE_PVC_MOUNT_PATH, code_subdir)
        )

    # If the user specified an alternate api, we will execute this
    # run by creating a custom object.
    api_version = resource_args.get("apiVersion", "batch/v1")

    if api_version not in ["batch/v1", "batch/v1beta1"]:
        env_vars = launch_project.get_env_vars_dict(
            self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
        )
        # Crawl the resource args and add our env vars to the containers.
        add_wandb_env(resource_args, env_vars)

        # Add our labels to the resource args. This is necessary for the
        # agent to find the custom object later on.
        resource_args["metadata"] = resource_args.get("metadata", {})
        resource_args["metadata"]["labels"] = resource_args["metadata"].get(
            "labels", {}
        )
        resource_args["metadata"]["labels"][WANDB_K8S_LABEL_MONITOR] = "true"

        # Crawl the resource args and add our labels to the pods. This is
        # necessary for the agent to find the pods later on.
        add_label_to_pods(
            resource_args,
            WANDB_K8S_LABEL_MONITOR,
            "true",
        )

        # Add wandb.ai/agent: current agent label on all pods
        if LaunchAgent.initialized():
            add_label_to_pods(
                resource_args,
                WANDB_K8S_LABEL_AGENT,
                LaunchAgent.name(),
            )
            resource_args["metadata"]["labels"][WANDB_K8S_LABEL_AGENT] = (
                LaunchAgent.name()
            )

        if launch_project.job_base_image:
            apply_code_mount_configuration(resource_args, launch_project)

        overrides = {}
        if launch_project.override_args:
            overrides["args"] = launch_project.override_args
        if launch_project.override_entrypoint:
            overrides["command"] = launch_project.override_entrypoint.command
        add_entrypoint_args_overrides(
            resource_args,
            overrides,
        )
        api = client.CustomObjectsApi(api_client)
        # Infer the attributes of a custom object from the apiVersion and/or
        # a kind: attribute in the resource args.
        namespace = self.get_namespace(resource_args, context)
        group, version, *_ = api_version.split("/")
        group = resource_args.get("group", group)
        version = resource_args.get("version", version)
        kind = resource_args.get("kind", version)
        plural = f"{kind.lower()}s"
        custom_resource = CustomResource(
            group=group,
            version=version,
            plural=plural,
        )
        LaunchKubernetesMonitor.monitor_namespace(
            namespace, custom_resource=custom_resource
        )

        try:
            response = await api.create_namespaced_custom_object(
                group=group,
                version=version,
                namespace=namespace,
                plural=plural,
                body=resource_args,
            )
        except ApiException as e:
            # The error body is normally JSON, but may be missing or plain
            # text; never let parsing it hide the real API error.
            try:
                body_yaml = yaml.dump(json.loads(e.body))
            except (TypeError, ValueError):
                body_yaml = str(e.body)
            raise LaunchError(
                f"Error creating CRD of kind {kind}: {e.status} {e.reason}\n{body_yaml}"
            ) from e
        name = response.get("metadata", {}).get("name")
        _logger.info(f"Created {kind} {name}")
        submitted_run = CrdSubmittedRun(
            name=name,
            group=group,
            version=version,
            namespace=namespace,
            plural=plural,
            core_api=client.CoreV1Api(api_client),
            custom_api=api,
        )
        if self.backend_config[PROJECT_SYNCHRONOUS]:
            await submitted_run.wait()
        return submitted_run

    batch_api = kubernetes_asyncio.client.BatchV1Api(api_client)
    core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
    apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)
    network_api = kubernetes_asyncio.client.NetworkingV1Api(api_client)

    namespace = self.get_namespace(resource_args, context)
    job, secret = await self._inject_defaults(
        resource_args, launch_project, image_uri, namespace, core_api
    )

    # Values available for macro substitution in additional services; the
    # process environment is layered on top.
    update_dict = {
        "project_name": launch_project.target_project,
        "entity_name": launch_project.target_entity,
        "run_id": launch_project.run_id,
        "run_name": launch_project.name,
        "image_uri": image_uri,
        "author": launch_project.author,
    }
    update_dict.update(os.environ)
    additional_services: List[Dict[str, Any]] = recursive_macro_sub(
        launch_project.launch_spec.get("additional_services", []), update_dict
    )
    if additional_services:
        wandb.termlog(
            f"{LOG_PREFIX}Creating additional services: {additional_services}"
        )

        wait_for_ready = resource_args.get("wait_for_ready", True)
        wait_timeout = resource_args.get("wait_timeout", 300)

        await asyncio.gather(
            *[
                self._prepare_resource(
                    api_client,
                    resource.get("config", {}),
                    namespace,
                    launch_project.run_id,
                    launch_project,
                    secret,
                    wait_for_ready,
                    wait_timeout,
                )
                for resource in additional_services
                if resource.get("config", {})
            ]
        )

    msg = "Creating Kubernetes job"
    if "name" in resource_args:
        msg += f": {resource_args['name']}"
    _logger.info(msg)
    try:
        response = await kubernetes_asyncio.utils.create_from_dict(
            api_client, job, namespace=namespace
        )
    except kubernetes_asyncio.utils.FailToCreateError as e:
        # Report the first underlying API error.
        for exc in e.api_exceptions:
            resp = json.loads(exc.body)
            msg = resp.get("message")
            code = resp.get("code")
            raise LaunchError(
                f"Failed to create Kubernetes job for run {launch_project.run_id} ({code} {exc.reason}): {msg}"
            ) from e
        # No underlying API errors were attached: still fail loudly instead
        # of continuing with an unbound response.
        raise LaunchError(
            f"Failed to create Kubernetes job for run {launch_project.run_id}: {e}"
        ) from e
    except Exception as e:
        raise LaunchError(
            f"Unexpected exception when creating Kubernetes job: {str(e)}\n"
        ) from e
    job_response = response[0]
    job_name = job_response.metadata.name
    LaunchKubernetesMonitor.monitor_namespace(namespace)
    submitted_job = KubernetesSubmittedRun(
        batch_api,
        core_api,
        apps_api,
        network_api,
        job_name,
        namespace,
        secret,
        f"aux-{launch_project.target_entity}-{launch_project.target_project}-{launch_project.run_id}",
    )
    if self.backend_config[PROJECT_SYNCHRONOUS]:
        await submitted_job.wait()

    return submitted_job
989
+
990
+
991
def inject_entrypoint_and_args(
    containers: List[dict],
    entry_point: Optional[EntryPoint],
    override_args: List[str],
    should_override_entrypoint: bool,
) -> None:
    """Set command and args on each container, in place.

    ``override_args``, when non-empty, replaces every container's ``args``.
    The entry point's command is written to a container when the container
    has no command of its own, or unconditionally when
    ``should_override_entrypoint`` is set.

    Arguments:
        containers: The containers to inject the entrypoint and args into.
        entry_point: The entrypoint to inject.
        override_args: The args to inject.
        should_override_entrypoint: Whether to override the entrypoint.

    Returns:
        None
    """
    for container in containers:
        if override_args:
            container["args"] = override_args
        if not entry_point:
            continue
        if should_override_entrypoint or not container.get("command"):
            container["command"] = entry_point.command
1015
+
1016
+
1017
async def ensure_api_key_secret(
    core_api: "CoreV1Api",
    secret_name: str,
    namespace: str,
    api_key: str,
) -> "V1Secret":
    """Create a secret containing a user's wandb API key.

    If a secret with the same name already exists it is reused when its data
    matches. When the data differs, the secret is replaced only if it carries
    the launch-agent label; otherwise an error is raised.

    Arguments:
        core_api: The Kubernetes CoreV1Api object.
        secret_name: The name to use for the secret.
        namespace: The namespace to create the secret in.
        api_key: The user's wandb API key

    Returns:
        The created (or already existing, matching) secret

    Raises:
        LaunchError: On any failure while creating, reading or replacing the
            secret. The underlying exception is chained as the cause.
    """
    secret_data = {"password": base64.b64encode(api_key.encode()).decode()}
    labels = {"wandb.ai/created-by": "launch-agent"}
    secret = client.V1Secret(
        data=secret_data,
        metadata=client.V1ObjectMeta(
            name=secret_name, namespace=namespace, labels=labels
        ),
        kind="Secret",
        type="kubernetes.io/basic-auth",
    )

    try:
        try:
            return await core_api.create_namespaced_secret(namespace, secret)
        except ApiException as e:
            # 409 = conflict = secret already exists
            if e.status != 409:
                raise
            existing_secret = await core_api.read_namespaced_secret(
                name=secret_name, namespace=namespace
            )
            if existing_secret.data == secret_data:
                return existing_secret
            # A secret without any labels reports None here; treat that the
            # same as "not created by the launch agent".
            existing_labels = existing_secret.metadata.labels or {}
            if existing_labels.get("wandb.ai/created-by") != "launch-agent":
                raise LaunchError(
                    f"Kubernetes secret already exists in namespace {namespace} with incorrect data: {secret_name}"
                )
            # If it's a previous secret made by launch agent, clean it up
            await core_api.delete_namespaced_secret(
                name=secret_name, namespace=namespace
            )
            return await core_api.create_namespaced_secret(namespace, secret)
    except Exception as e:
        raise LaunchError(
            f"Exception when ensuring Kubernetes API key secret: {str(e)}\n"
        ) from e
1076
+
1077
+
1078
async def maybe_create_imagepull_secret(
    core_api: "CoreV1Api",
    registry: AbstractRegistry,
    run_id: str,
    namespace: str,
) -> Optional["V1Secret"]:
    """Create an image pull secret for a private registry, if one is needed.

    Local and Azure container registries are skipped. For any other registry
    a ``kubernetes.io/dockerconfigjson`` secret named ``regcred-<run_id>`` is
    created from the registry's username and password; if it already exists,
    the existing secret is read back instead.

    Arguments:
        core_api: The Kubernetes CoreV1Api object.
        registry: The registry to pull from.
        run_id: The run id.
        namespace: The namespace to create the secret in.

    Returns:
        A secret if one was created, otherwise None.
    """
    if isinstance(registry, (LocalRegistry, AzureContainerRegistry)):
        # Secret not required
        return None

    uname, token = await registry.get_username_password()
    docker_config = {
        "auths": {
            registry.uri: {
                "auth": base64.b64encode(f"{uname}:{token}".encode()).decode(),
                # need an email but the use is deprecated
                "email": "deprecated@wandblaunch.com",
            }
        }
    }
    encoded_config = base64.b64encode(json.dumps(docker_config).encode()).decode()
    pull_secret = client.V1Secret(
        data={".dockerconfigjson": encoded_config},
        metadata=client.V1ObjectMeta(name=f"regcred-{run_id}", namespace=namespace),
        kind="Secret",
        type="kubernetes.io/dockerconfigjson",
    )
    try:
        try:
            return await core_api.create_namespaced_secret(namespace, pull_secret)
        except ApiException as e:
            if e.status != 409:
                raise
            # 409 = conflict = secret already exists
            return await core_api.read_namespaced_secret(
                name=f"regcred-{run_id}", namespace=namespace
            )
    except Exception as e:
        raise LaunchError(f"Exception when creating Kubernetes secret: {str(e)}\n")
1132
+
1133
+
1134
def yield_containers(root: Any) -> Iterator[dict]:
    """Iterate over every container spec found in a manifest.

    The manifest is walked depth-first. Whenever a dict has a "containers"
    key whose value is a list, the items of that list are yielded; the list
    itself is not searched any further. A non-list "containers" value is
    ignored.
    """
    if isinstance(root, dict):
        for key, value in root.items():
            if key != "containers":
                if isinstance(value, (dict, list)):
                    yield from yield_containers(value)
                continue
            if isinstance(value, list):
                yield from value
        return
    if isinstance(root, list):
        for element in root:
            yield from yield_containers(element)
1150
+
1151
+
1152
def add_wandb_env(root: Union[dict, list], env_vars: Dict[str, str]) -> None:
    """Injects wandb environment variables into specs.

    Recursively walks the spec and appends the environment variables to
    every container spec. Containers are identified by the "containers" key.
    Existing entries in a container's ``env`` list are kept; the new
    variables are appended after them.

    WANDB_RUN_ID is treated specially: if it is present in env_vars, it is
    only set in the first container modified by this function. The caller's
    ``env_vars`` dict is left untouched.

    Arguments:
        root: The spec to modify.
        env_vars: The environment variables to inject.

    Returns: None.
    """
    # Work on a copy so dropping WANDB_RUN_ID after the first container does
    # not leak back into the caller's dict.
    pending = dict(env_vars)
    for cont in yield_containers(root):
        # Tolerate specs that carry an explicit `env: null`.
        env = cont.get("env") or []
        env.extend({"name": key, "value": value} for key, value in pending.items())
        cont["env"] = env
        # After we have set WANDB_RUN_ID once, we don't want to set it again
        pending.pop("WANDB_RUN_ID", None)
1176
+
1177
+
1178
def yield_pods(manifest: Any) -> Iterator[dict]:
    """Iterate over every pod spec found in a manifest.

    The manifest is walked depth-first. A dict counts as a pod spec when it
    has a "spec" key whose value contains "containers"; such a dict is
    yielded before its own values are searched.
    """
    if isinstance(manifest, dict):
        if "spec" in manifest and "containers" in manifest["spec"]:
            yield manifest
        children = manifest.values()
    elif isinstance(manifest, list):
        children = manifest
    else:
        return
    for child in children:
        # Scalars produce nothing, so only containers need descending into.
        if isinstance(child, (dict, list)):
            yield from yield_pods(child)
1194
+
1195
+
1196
def add_label_to_pods(
    manifest: Union[dict, list], label_key: str, label_value: str
) -> None:
    """Set a label on every pod spec in a manifest, in place.

    Pod specs are located with ``yield_pods``. Missing ``metadata`` and
    ``labels`` dicts are created as needed; an existing label with the same
    key is overwritten.

    Arguments:
        manifest: The manifest to modify.
        label_key: The label key to add.
        label_value: The label value to add.

    Returns: None.
    """
    for pod in yield_pods(manifest):
        pod.setdefault("metadata", {}).setdefault("labels", {})[label_key] = (
            label_value
        )
1216
+
1217
+
1218
def add_entrypoint_args_overrides(manifest: Union[dict, list], overrides: dict) -> None:
    """Apply command and args overrides to every container in a manifest.

    The manifest is walked recursively. For each dict that has a "spec" key
    whose value contains "containers", every container gets its "command"
    and/or "args" replaced by the matching entry in ``overrides``. Keys
    absent from ``overrides`` are left alone.

    Arguments:
        manifest: The manifest to modify.
        overrides: Dictionary with optional "command" and "args" keys.

    Returns: None.
    """
    if isinstance(manifest, dict):
        if "spec" in manifest and "containers" in manifest["spec"]:
            replacements = {
                field: overrides[field]
                for field in ("command", "args")
                if field in overrides
            }
            for container in manifest["spec"]["containers"]:
                container.update(replacements)
        children = manifest.values()
    elif isinstance(manifest, list):
        children = manifest
    else:
        return
    for child in children:
        add_entrypoint_args_overrides(child, overrides)
1244
+
1245
+
1246
def apply_code_mount_configuration(
    manifest: Union[Dict, list], project: LaunchProject
) -> None:
    """Mount the source code volume into every pod of a manifest, in place.

    For each pod spec found with ``yield_pods``, every container gets a
    volume mount of the source code volume at ``CODE_MOUNT_DIR`` (using the
    project's image source string as sub path) and has its working directory
    set to that mount. The pod spec itself gets the matching volume, backed
    by the persistent volume claim named by ``SOURCE_CODE_PVC_NAME``.

    Arguments:
        manifest: The manifest to modify.
        project: The launch project.

    Returns: None.
    """
    assert SOURCE_CODE_PVC_NAME is not None
    source_dir = project.get_image_source_string()
    for pod in yield_pods(manifest):
        for container in yield_containers(pod):
            mounts = container.setdefault("volumeMounts", [])
            mounts.append(
                {
                    "name": "wandb-source-code-volume",
                    "mountPath": CODE_MOUNT_DIR,
                    "subPath": source_dir,
                }
            )
            container["workingDir"] = CODE_MOUNT_DIR
        volumes = pod["spec"].setdefault("volumes", [])
        volumes.append(
            {
                "name": "wandb-source-code-volume",
                "persistentVolumeClaim": {
                    "claimName": SOURCE_CODE_PVC_NAME,
                },
            }
        )