wandb 0.16.5__py3-none-any.whl → 0.17.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (194) hide show
  1. package_readme.md +95 -0
  2. wandb/__init__.py +2 -3
  3. wandb/agents/pyagent.py +0 -1
  4. wandb/analytics/sentry.py +2 -1
  5. wandb/apis/importers/internals/internal.py +0 -1
  6. wandb/apis/importers/internals/protocols.py +30 -56
  7. wandb/apis/importers/mlflow.py +13 -26
  8. wandb/apis/importers/wandb.py +8 -14
  9. wandb/apis/internal.py +0 -3
  10. wandb/apis/public/api.py +55 -3
  11. wandb/apis/public/artifacts.py +1 -0
  12. wandb/apis/public/files.py +1 -0
  13. wandb/apis/public/history.py +1 -0
  14. wandb/apis/public/jobs.py +17 -4
  15. wandb/apis/public/projects.py +1 -0
  16. wandb/apis/public/reports.py +1 -0
  17. wandb/apis/public/runs.py +15 -17
  18. wandb/apis/public/sweeps.py +1 -0
  19. wandb/apis/public/teams.py +1 -0
  20. wandb/apis/public/users.py +1 -0
  21. wandb/apis/reports/v1/_blocks.py +3 -7
  22. wandb/apis/reports/v2/gql.py +1 -0
  23. wandb/apis/reports/v2/interface.py +3 -4
  24. wandb/apis/reports/v2/internal.py +5 -8
  25. wandb/cli/cli.py +95 -22
  26. wandb/data_types.py +9 -6
  27. wandb/docker/__init__.py +1 -1
  28. wandb/env.py +38 -8
  29. wandb/errors/__init__.py +5 -0
  30. wandb/errors/term.py +10 -2
  31. wandb/filesync/step_checksum.py +1 -4
  32. wandb/filesync/step_prepare.py +4 -24
  33. wandb/filesync/step_upload.py +4 -106
  34. wandb/filesync/upload_job.py +0 -76
  35. wandb/integration/catboost/catboost.py +1 -1
  36. wandb/integration/fastai/__init__.py +1 -0
  37. wandb/integration/huggingface/resolver.py +2 -2
  38. wandb/integration/keras/__init__.py +1 -0
  39. wandb/integration/keras/callbacks/metrics_logger.py +1 -1
  40. wandb/integration/keras/keras.py +7 -7
  41. wandb/integration/langchain/wandb_tracer.py +1 -0
  42. wandb/integration/lightning/fabric/logger.py +1 -3
  43. wandb/integration/metaflow/metaflow.py +41 -6
  44. wandb/integration/openai/fine_tuning.py +77 -40
  45. wandb/integration/prodigy/prodigy.py +1 -1
  46. wandb/old/summary.py +1 -1
  47. wandb/plot/confusion_matrix.py +1 -1
  48. wandb/plot/pr_curve.py +2 -1
  49. wandb/plot/roc_curve.py +2 -1
  50. wandb/{plots → plot}/utils.py +13 -25
  51. wandb/proto/v3/wandb_internal_pb2.py +364 -332
  52. wandb/proto/v3/wandb_settings_pb2.py +2 -2
  53. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  54. wandb/proto/v4/wandb_internal_pb2.py +322 -316
  55. wandb/proto/v4/wandb_settings_pb2.py +2 -2
  56. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  57. wandb/proto/wandb_deprecated.py +7 -1
  58. wandb/proto/wandb_internal_codegen.py +3 -29
  59. wandb/sdk/artifacts/artifact.py +51 -20
  60. wandb/sdk/artifacts/artifact_download_logger.py +1 -0
  61. wandb/sdk/artifacts/artifact_file_cache.py +18 -4
  62. wandb/sdk/artifacts/artifact_instance_cache.py +1 -0
  63. wandb/sdk/artifacts/artifact_manifest.py +1 -0
  64. wandb/sdk/artifacts/artifact_manifest_entry.py +7 -3
  65. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
  66. wandb/sdk/artifacts/artifact_saver.py +18 -27
  67. wandb/sdk/artifacts/artifact_state.py +1 -0
  68. wandb/sdk/artifacts/artifact_ttl.py +1 -0
  69. wandb/sdk/artifacts/exceptions.py +1 -0
  70. wandb/sdk/artifacts/storage_handlers/azure_handler.py +1 -0
  71. wandb/sdk/artifacts/storage_handlers/gcs_handler.py +13 -18
  72. wandb/sdk/artifacts/storage_handlers/http_handler.py +1 -0
  73. wandb/sdk/artifacts/storage_handlers/local_file_handler.py +1 -0
  74. wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -0
  75. wandb/sdk/artifacts/storage_handlers/s3_handler.py +5 -3
  76. wandb/sdk/artifacts/storage_handlers/tracking_handler.py +1 -0
  77. wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +1 -0
  78. wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +1 -0
  79. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +3 -42
  80. wandb/sdk/artifacts/storage_policy.py +2 -12
  81. wandb/sdk/data_types/_dtypes.py +8 -8
  82. wandb/sdk/data_types/base_types/media.py +3 -6
  83. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
  84. wandb/sdk/data_types/image.py +1 -1
  85. wandb/sdk/data_types/video.py +1 -1
  86. wandb/sdk/integration_utils/auto_logging.py +5 -6
  87. wandb/sdk/integration_utils/data_logging.py +10 -6
  88. wandb/sdk/interface/interface.py +86 -38
  89. wandb/sdk/interface/interface_shared.py +7 -13
  90. wandb/sdk/internal/datastore.py +1 -1
  91. wandb/sdk/internal/file_pusher.py +2 -5
  92. wandb/sdk/internal/file_stream.py +5 -18
  93. wandb/sdk/internal/handler.py +18 -2
  94. wandb/sdk/internal/internal.py +0 -1
  95. wandb/sdk/internal/internal_api.py +1 -129
  96. wandb/sdk/internal/internal_util.py +0 -1
  97. wandb/sdk/internal/job_builder.py +159 -45
  98. wandb/sdk/internal/profiler.py +1 -0
  99. wandb/sdk/internal/progress.py +0 -28
  100. wandb/sdk/internal/run.py +1 -0
  101. wandb/sdk/internal/sender.py +1 -2
  102. wandb/sdk/internal/system/assets/gpu_amd.py +44 -44
  103. wandb/sdk/internal/system/assets/gpu_apple.py +56 -11
  104. wandb/sdk/internal/system/assets/interfaces.py +6 -8
  105. wandb/sdk/internal/system/assets/open_metrics.py +2 -2
  106. wandb/sdk/internal/system/assets/trainium.py +1 -3
  107. wandb/sdk/launch/__init__.py +9 -1
  108. wandb/sdk/launch/_launch.py +9 -24
  109. wandb/sdk/launch/_launch_add.py +1 -3
  110. wandb/sdk/launch/_project_spec.py +188 -241
  111. wandb/sdk/launch/agent/agent.py +115 -48
  112. wandb/sdk/launch/agent/config.py +80 -14
  113. wandb/sdk/launch/builder/abstract.py +69 -1
  114. wandb/sdk/launch/builder/build.py +156 -555
  115. wandb/sdk/launch/builder/context_manager.py +235 -0
  116. wandb/sdk/launch/builder/docker_builder.py +8 -23
  117. wandb/sdk/launch/builder/kaniko_builder.py +161 -159
  118. wandb/sdk/launch/builder/noop.py +1 -0
  119. wandb/sdk/launch/builder/templates/dockerfile.py +92 -0
  120. wandb/sdk/launch/create_job.py +68 -63
  121. wandb/sdk/launch/environment/abstract.py +1 -0
  122. wandb/sdk/launch/environment/gcp_environment.py +1 -0
  123. wandb/sdk/launch/environment/local_environment.py +1 -0
  124. wandb/sdk/launch/inputs/files.py +148 -0
  125. wandb/sdk/launch/inputs/internal.py +217 -0
  126. wandb/sdk/launch/inputs/manage.py +95 -0
  127. wandb/sdk/launch/loader.py +1 -0
  128. wandb/sdk/launch/registry/abstract.py +1 -0
  129. wandb/sdk/launch/registry/azure_container_registry.py +1 -0
  130. wandb/sdk/launch/registry/elastic_container_registry.py +1 -0
  131. wandb/sdk/launch/registry/google_artifact_registry.py +2 -1
  132. wandb/sdk/launch/registry/local_registry.py +1 -0
  133. wandb/sdk/launch/runner/abstract.py +1 -0
  134. wandb/sdk/launch/runner/kubernetes_monitor.py +4 -1
  135. wandb/sdk/launch/runner/kubernetes_runner.py +9 -10
  136. wandb/sdk/launch/runner/local_container.py +2 -3
  137. wandb/sdk/launch/runner/local_process.py +8 -29
  138. wandb/sdk/launch/runner/sagemaker_runner.py +21 -20
  139. wandb/sdk/launch/runner/vertex_runner.py +8 -7
  140. wandb/sdk/launch/sweeps/scheduler.py +7 -4
  141. wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
  142. wandb/sdk/launch/sweeps/utils.py +3 -3
  143. wandb/sdk/launch/utils.py +33 -140
  144. wandb/sdk/lib/_settings_toposort_generated.py +1 -5
  145. wandb/sdk/lib/fsm.py +8 -12
  146. wandb/sdk/lib/gitlib.py +4 -4
  147. wandb/sdk/lib/import_hooks.py +1 -1
  148. wandb/sdk/lib/lazyloader.py +0 -1
  149. wandb/sdk/lib/proto_util.py +23 -2
  150. wandb/sdk/lib/redirect.py +19 -14
  151. wandb/sdk/lib/retry.py +3 -2
  152. wandb/sdk/lib/run_moment.py +7 -1
  153. wandb/sdk/lib/tracelog.py +1 -1
  154. wandb/sdk/service/service.py +19 -16
  155. wandb/sdk/verify/verify.py +2 -1
  156. wandb/sdk/wandb_init.py +16 -63
  157. wandb/sdk/wandb_manager.py +2 -2
  158. wandb/sdk/wandb_require.py +5 -0
  159. wandb/sdk/wandb_run.py +164 -90
  160. wandb/sdk/wandb_settings.py +2 -48
  161. wandb/sdk/wandb_setup.py +1 -1
  162. wandb/sklearn/__init__.py +1 -0
  163. wandb/sklearn/plot/__init__.py +1 -0
  164. wandb/sklearn/plot/classifier.py +11 -12
  165. wandb/sklearn/plot/clusterer.py +2 -1
  166. wandb/sklearn/plot/regressor.py +1 -0
  167. wandb/sklearn/plot/shared.py +1 -0
  168. wandb/sklearn/utils.py +1 -0
  169. wandb/testing/relay.py +4 -4
  170. wandb/trigger.py +1 -0
  171. wandb/util.py +67 -54
  172. wandb/wandb_controller.py +2 -3
  173. wandb/wandb_torch.py +1 -2
  174. {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/METADATA +67 -70
  175. {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/RECORD +178 -188
  176. {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/WHEEL +1 -2
  177. wandb/bin/apple_gpu_stats +0 -0
  178. wandb/catboost/__init__.py +0 -9
  179. wandb/fastai/__init__.py +0 -9
  180. wandb/keras/__init__.py +0 -18
  181. wandb/lightgbm/__init__.py +0 -9
  182. wandb/plots/__init__.py +0 -6
  183. wandb/plots/explain_text.py +0 -36
  184. wandb/plots/heatmap.py +0 -81
  185. wandb/plots/named_entity.py +0 -43
  186. wandb/plots/part_of_speech.py +0 -50
  187. wandb/plots/plot_definitions.py +0 -768
  188. wandb/plots/precision_recall.py +0 -121
  189. wandb/plots/roc.py +0 -103
  190. wandb/sacred/__init__.py +0 -3
  191. wandb/xgboost/__init__.py +0 -9
  192. wandb-0.16.5.dist-info/top_level.txt +0 -1
  193. {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/entry_points.txt +0 -0
  194. {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,4 +1,5 @@
1
1
  """Implementation of launch agent."""
2
+
2
3
  import asyncio
3
4
  import logging
4
5
  import os
@@ -8,7 +9,9 @@ import time
8
9
  import traceback
9
10
  from dataclasses import dataclass
10
11
  from multiprocessing import Event
11
- from typing import Any, Dict, List, Optional, Union
12
+ from typing import Any, Dict, List, Optional, Tuple, Union
13
+
14
+ import yaml
12
15
 
13
16
  import wandb
14
17
  from wandb.apis.internal import Api
@@ -17,11 +20,11 @@ from wandb.sdk.launch._launch_add import launch_add
17
20
  from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
18
21
  from wandb.sdk.launch.runner.local_process import LocalProcessRunner
19
22
  from wandb.sdk.launch.sweeps.scheduler import Scheduler
23
+ from wandb.sdk.launch.utils import LAUNCH_CONFIG_FILE, resolve_build_and_registry_config
20
24
  from wandb.sdk.lib import runid
21
25
 
22
26
  from .. import loader
23
27
  from .._project_spec import LaunchProject
24
- from ..builder.build import construct_agent_configs
25
28
  from ..errors import LaunchDockerError, LaunchError
26
29
  from ..utils import (
27
30
  LAUNCH_DEFAULT_PROJECT,
@@ -45,7 +48,10 @@ MAX_RESUME_COUNT = 5
45
48
 
46
49
  RUN_INFO_GRACE_PERIOD = 60
47
50
 
48
- MAX_WAIT_RUN_STOPPED = 60
51
+ DEFAULT_STOPPED_RUN_TIMEOUT = 60
52
+
53
+ DEFAULT_PRINT_INTERVAL = 5 * 60
54
+ VERBOSE_PRINT_INTERVAL = 20
49
55
 
50
56
  _env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
51
57
  if _env_timeout:
@@ -105,30 +111,54 @@ def _max_from_config(
105
111
  return max_from_config
106
112
 
107
113
 
108
- def _is_scheduler_job(run_spec: Dict[str, Any]) -> bool:
109
- """Determine whether a job/runSpec is a sweep scheduler."""
110
- if not run_spec:
111
- _logger.debug("Recieved runSpec in _is_scheduler_job that was empty")
114
+ class InternalAgentLogger:
115
+ def __init__(self, verbosity=0):
116
+ self._print_to_terminal = verbosity >= 2
112
117
 
113
- if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
114
- return False
118
+ def error(self, message: str):
119
+ if self._print_to_terminal:
120
+ wandb.termerror(f"{LOG_PREFIX}{message}")
121
+ _logger.error(f"{LOG_PREFIX}{message}")
115
122
 
116
- if run_spec.get("resource") == "local-process":
117
- # Any job pushed to a run queue that has a scheduler uri is
118
- # allowed to use local-process
119
- if run_spec.get("job"):
120
- return True
123
+ def warn(self, message: str):
124
+ if self._print_to_terminal:
125
+ wandb.termwarn(f"{LOG_PREFIX}{message}")
126
+ _logger.warn(f"{LOG_PREFIX}{message}")
121
127
 
122
- # If a scheduler is local-process and run through CLI, also
123
- # confirm command is in format: [wandb scheduler <sweep>]
124
- cmd = run_spec.get("overrides", {}).get("entry_point", [])
125
- if len(cmd) < 3:
126
- return False
128
+ def info(self, message: str):
129
+ if self._print_to_terminal:
130
+ wandb.termlog(f"{LOG_PREFIX}{message}")
131
+ _logger.info(f"{LOG_PREFIX}{message}")
127
132
 
128
- if cmd[:2] != ["wandb", "scheduler"]:
129
- return False
133
+ def debug(self, message: str):
134
+ if self._print_to_terminal:
135
+ wandb.termlog(f"{LOG_PREFIX}{message}")
136
+ _logger.debug(f"{LOG_PREFIX}{message}")
130
137
 
131
- return True
138
+
139
+ def construct_agent_configs(
140
+ launch_config: Optional[Dict] = None,
141
+ build_config: Optional[Dict] = None,
142
+ ) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any], Dict[str, Any]]:
143
+ registry_config = None
144
+ environment_config = None
145
+ if launch_config is not None:
146
+ build_config = launch_config.get("builder")
147
+ registry_config = launch_config.get("registry")
148
+
149
+ default_launch_config = None
150
+ if os.path.exists(os.path.expanduser(LAUNCH_CONFIG_FILE)):
151
+ with open(os.path.expanduser(LAUNCH_CONFIG_FILE)) as f:
152
+ default_launch_config = (
153
+ yaml.safe_load(f) or {}
154
+ ) # In case the config is empty, we want it to be {} instead of None.
155
+ environment_config = default_launch_config.get("environment")
156
+
157
+ build_config, registry_config = resolve_build_and_registry_config(
158
+ default_launch_config, build_config, registry_config
159
+ )
160
+
161
+ return environment_config, build_config, registry_config
132
162
 
133
163
 
134
164
  class LaunchAgent:
@@ -170,7 +200,7 @@ class LaunchAgent:
170
200
  config: Config dictionary for the agent.
171
201
  """
172
202
  self._entity = config["entity"]
173
- self._project = config.get("project", LAUNCH_DEFAULT_PROJECT)
203
+ self._project = LAUNCH_DEFAULT_PROJECT
174
204
  self._api = api
175
205
  self._base_url = self._api.settings().get("base_url")
176
206
  self._ticks = 0
@@ -184,7 +214,13 @@ class LaunchAgent:
184
214
  self._max_jobs = _max_from_config(config, "max_jobs")
185
215
  self._max_schedulers = _max_from_config(config, "max_schedulers")
186
216
  self._secure_mode = config.get("secure_mode", False)
217
+ self._verbosity = config.get("verbosity", 0)
218
+ self._internal_logger = InternalAgentLogger(verbosity=self._verbosity)
219
+ self._last_status_print_time = 0.0
187
220
  self.default_config: Dict[str, Any] = config
221
+ self._stopped_run_timeout = config.get(
222
+ "stopped_run_timeout", DEFAULT_STOPPED_RUN_TIMEOUT
223
+ )
188
224
 
189
225
  # Get agent version from env var if present, otherwise wandb version
190
226
  self.version: str = "wandb@" + wandb.__version__
@@ -228,6 +264,33 @@ class LaunchAgent:
228
264
  self._name = agent_response["name"]
229
265
  self._init_agent_run()
230
266
 
267
+ def _is_scheduler_job(self, run_spec: Dict[str, Any]) -> bool:
268
+ """Determine whether a job/runSpec is a sweep scheduler."""
269
+ if not run_spec:
270
+ self._internal_logger.debug(
271
+ "Received runSpec in _is_scheduler_job that was empty"
272
+ )
273
+
274
+ if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
275
+ return False
276
+
277
+ if run_spec.get("resource") == "local-process":
278
+ # Any job pushed to a run queue that has a scheduler uri is
279
+ # allowed to use local-process
280
+ if run_spec.get("job"):
281
+ return True
282
+
283
+ # If a scheduler is local-process and run through CLI, also
284
+ # confirm command is in format: [wandb scheduler <sweep>]
285
+ cmd = run_spec.get("overrides", {}).get("entry_point", [])
286
+ if len(cmd) < 3:
287
+ return False
288
+
289
+ if cmd[:2] != ["wandb", "scheduler"]:
290
+ return False
291
+
292
+ return True
293
+
231
294
  async def fail_run_queue_item(
232
295
  self,
233
296
  run_queue_item_id: str,
@@ -241,6 +304,8 @@ class LaunchAgent:
241
304
 
242
305
  def _init_agent_run(self) -> None:
243
306
  # TODO: has it been long enough that all backends support agents?
307
+ self._wandb_run = None
308
+
244
309
  if self.gorilla_supports_agents:
245
310
  settings = wandb.Settings(silent=True, disable_git=True)
246
311
  self._wandb_run = wandb.init(
@@ -250,8 +315,6 @@ class LaunchAgent:
250
315
  id=self._name,
251
316
  job_type=HIDDEN_AGENT_RUN_TYPE,
252
317
  )
253
- else:
254
- self._wandb_run = None
255
318
 
256
319
  @property
257
320
  def thread_ids(self) -> List[int]:
@@ -298,14 +361,12 @@ class LaunchAgent:
298
361
 
299
362
  def print_status(self) -> None:
300
363
  """Prints the current status of the agent."""
364
+ self._last_status_print_time = time.time()
301
365
  output_str = "agent "
302
366
  if self._name:
303
367
  output_str += f"{self._name} "
304
368
  if self.num_running_jobs < self._max_jobs:
305
- output_str += "polling on "
306
- if self._project != LAUNCH_DEFAULT_PROJECT:
307
- output_str += f"project {self._project}, "
308
- output_str += f"queues {','.join(self._queues)}, "
369
+ output_str += f"polling on queues {','.join(self._queues)}, "
309
370
  output_str += (
310
371
  f"running {self.num_running_jobs} out of a maximum of {self._max_jobs} jobs"
311
372
  )
@@ -344,8 +405,8 @@ class LaunchAgent:
344
405
  if run_state.lower() != "pending":
345
406
  return True
346
407
  except CommError:
347
- _logger.info(
348
- f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run"
408
+ self._internal_logger.info(
409
+ f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run",
349
410
  )
350
411
  return False
351
412
 
@@ -361,8 +422,8 @@ class LaunchAgent:
361
422
  job_and_run_status.entity is not None
362
423
  and job_and_run_status.entity != self._entity
363
424
  ):
364
- _logger.info(
365
- "Skipping check for completed run status because run is on a different entity than agent"
425
+ self._internal_logger.info(
426
+ "Skipping check for completed run status because run is on a different entity than agent",
366
427
  )
367
428
  elif exception is not None:
368
429
  tb_str = traceback.format_exception(
@@ -378,8 +439,8 @@ class LaunchAgent:
378
439
  fnames,
379
440
  )
380
441
  elif job_and_run_status.project is None or job_and_run_status.run_id is None:
381
- _logger.error(
382
- f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}"
442
+ self._internal_logger.info(
443
+ f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}",
383
444
  )
384
445
  wandb.termerror(
385
446
  "Missing project or run id on thread called finish thread id"
@@ -397,7 +458,6 @@ class LaunchAgent:
397
458
  # We retry for 60 seconds with an exponential backoff in case
398
459
  # upsert run is taking a while.
399
460
  logs = None
400
- start_time = time.time()
401
461
  interval = 1
402
462
  while True:
403
463
  called_init = self._check_run_exists_and_inited(
@@ -406,7 +466,7 @@ class LaunchAgent:
406
466
  job_and_run_status.run_id,
407
467
  job_and_run_status.run_queue_item_id,
408
468
  )
409
- if called_init or time.time() - start_time > RUN_INFO_GRACE_PERIOD:
469
+ if called_init or interval > RUN_INFO_GRACE_PERIOD:
410
470
  break
411
471
  if not called_init:
412
472
  # Fetch the logs now if we don't get run info on the
@@ -430,7 +490,9 @@ class LaunchAgent:
430
490
  job_and_run_status.run_queue_item_id, _msg, "run", fnames
431
491
  )
432
492
  else:
433
- _logger.info(f"Finish thread id {thread_id} had no exception and no run")
493
+ self._internal_logger.info(
494
+ f"Finish thread id {thread_id} had no exception and no run"
495
+ )
434
496
  wandb._sentry.exception(
435
497
  "launch agent called finish thread id on thread without run or exception"
436
498
  )
@@ -458,7 +520,7 @@ class LaunchAgent:
458
520
  await self.update_status(AGENT_RUNNING)
459
521
 
460
522
  # parse job
461
- _logger.info("Parsing launch spec")
523
+ self._internal_logger.info("Parsing launch spec")
462
524
  launch_spec = job["runSpec"]
463
525
 
464
526
  # Abort if this job attempts to override secure mode
@@ -511,6 +573,10 @@ class LaunchAgent:
511
573
  KeyboardInterrupt: if the agent is requested to stop.
512
574
  """
513
575
  self.print_status()
576
+ if self._verbosity == 0:
577
+ print_interval = DEFAULT_PRINT_INTERVAL
578
+ else:
579
+ print_interval = VERBOSE_PRINT_INTERVAL
514
580
  try:
515
581
  while True:
516
582
  job = None
@@ -532,7 +598,7 @@ class LaunchAgent:
532
598
  file_saver = RunQueueItemFileSaver(
533
599
  self._wandb_run, job["runQueueItemId"]
534
600
  )
535
- if _is_scheduler_job(job.get("runSpec", {})):
601
+ if self._is_scheduler_job(job.get("runSpec", {})):
536
602
  # If job is a scheduler, and we are already at the cap, ignore,
537
603
  # don't ack, and it will be pushed back onto the queue in 1 min
538
604
  if self.num_running_schedulers >= self._max_schedulers:
@@ -567,6 +633,7 @@ class LaunchAgent:
567
633
  await self.update_status(AGENT_POLLING)
568
634
  else:
569
635
  await self.update_status(AGENT_RUNNING)
636
+ if time.time() - self._last_status_print_time > print_interval:
570
637
  self.print_status()
571
638
 
572
639
  if self.num_running_jobs == self._max_jobs or job is None:
@@ -634,21 +701,21 @@ class LaunchAgent:
634
701
  await self.check_sweep_state(launch_spec, api)
635
702
 
636
703
  job_tracker.update_run_info(project)
637
- _logger.info("Fetching and validating project...")
704
+ self._internal_logger.info("Fetching and validating project...")
638
705
  project.fetch_and_validate_project()
639
- _logger.info("Fetching resource...")
706
+ self._internal_logger.info("Fetching resource...")
640
707
  resource = launch_spec.get("resource") or "local-container"
641
708
  backend_config: Dict[str, Any] = {
642
709
  PROJECT_SYNCHRONOUS: False, # agent always runs async
643
710
  }
644
- _logger.info("Loading backend")
711
+ self._internal_logger.info("Loading backend")
645
712
  override_build_config = launch_spec.get("builder")
646
713
 
647
714
  _, build_config, registry_config = construct_agent_configs(
648
715
  default_config, override_build_config
649
716
  )
650
717
  image_uri = project.docker_image
651
- entrypoint = project.get_single_entry_point()
718
+ entrypoint = project.get_job_entry_point()
652
719
  environment = loader.environment_from_config(
653
720
  default_config.get("environment", {})
654
721
  )
@@ -661,13 +728,13 @@ class LaunchAgent:
661
728
  assert entrypoint is not None
662
729
  image_uri = await builder.build_image(project, entrypoint, job_tracker)
663
730
 
664
- _logger.info("Backend loaded...")
731
+ self._internal_logger.info("Backend loaded...")
665
732
  if isinstance(backend, LocalProcessRunner):
666
733
  run = await backend.run(project, image_uri)
667
734
  else:
668
735
  assert image_uri
669
736
  run = await backend.run(project, image_uri)
670
- if _is_scheduler_job(launch_spec):
737
+ if self._is_scheduler_job(launch_spec):
671
738
  with self._jobs_lock:
672
739
  self._jobs[thread_id].is_scheduler = True
673
740
  wandb.termlog(
@@ -700,7 +767,7 @@ class LaunchAgent:
700
767
  if stopped_time is None:
701
768
  stopped_time = time.time()
702
769
  else:
703
- if time.time() - stopped_time > MAX_WAIT_RUN_STOPPED:
770
+ if time.time() - stopped_time > self._stopped_run_timeout:
704
771
  await run.cancel()
705
772
  await asyncio.sleep(AGENT_POLLING_INTERVAL)
706
773
 
@@ -720,7 +787,7 @@ class LaunchAgent:
720
787
  project=launch_spec["project"],
721
788
  )
722
789
  except Exception as e:
723
- _logger.debug(f"Fetch sweep state error: {e}")
790
+ self._internal_logger.debug(f"Fetch sweep state error: {e}")
724
791
  state = None
725
792
 
726
793
  if state != "RUNNING" and state != "PAUSED":
@@ -80,17 +80,7 @@ class RegistryConfig(BaseModel):
80
80
  @validator("uri") # type: ignore
81
81
  @classmethod
82
82
  def validate_uri(cls, uri: str) -> str:
83
- for regex in [
84
- GCP_ARTIFACT_REGISTRY_URI_REGEX,
85
- AZURE_CONTAINER_REGISTRY_URI_REGEX,
86
- ELASTIC_CONTAINER_REGISTRY_URI_REGEX,
87
- ]:
88
- if regex.match(uri):
89
- return uri
90
- raise ValueError(
91
- "Invalid uri. URI must be a repository URI for an "
92
- "ECR, ACR, or GCP Artifact Registry."
93
- )
83
+ return validate_registry_uri(uri)
94
84
 
95
85
 
96
86
  class EnvironmentConfig(BaseModel):
@@ -186,6 +176,14 @@ class BuilderConfig(BaseModel):
186
176
  """Right now there are no required fields for docker builds."""
187
177
  return values
188
178
 
179
+ @validator("destination") # type: ignore
180
+ @classmethod
181
+ def validate_destination(cls, destination: Optional[str]) -> Optional[str]:
182
+ """Validate that the destination is a valid container registry URI."""
183
+ if destination is None:
184
+ return None
185
+ return validate_registry_uri(destination)
186
+
189
187
 
190
188
  class AgentConfig(BaseModel):
191
189
  """Configuration for the Launch agent."""
@@ -194,9 +192,6 @@ class AgentConfig(BaseModel):
194
192
  default=[],
195
193
  description="The queues to use for this agent.",
196
194
  )
197
- project: Optional[str] = Field(
198
- description="The W&B project to use for this agent.",
199
- )
200
195
  entity: Optional[str] = Field(
201
196
  description="The W&B entity to use for this agent.",
202
197
  )
@@ -225,6 +220,77 @@ class AgentConfig(BaseModel):
225
220
  None,
226
221
  description="The builder to use.",
227
222
  )
223
+ verbosity: Optional[int] = Field(
224
+ 0,
225
+ description="How verbose to print, 0 = default, 1 = verbose, 2 = very verbose",
226
+ )
227
+ stopped_run_timeout: Optional[int] = Field(
228
+ 60,
229
+ description="How many seconds to wait after receiving the stop command before forcibly cancelling a run.",
230
+ )
228
231
 
229
232
  class Config:
230
233
  extra = "forbid"
234
+
235
+
236
+ def validate_registry_uri(uri: str) -> str:
237
+ """Validate that the registry URI is a valid container registry URI.
238
+
239
+ The URI should resolve to an image name in a container registry. The recognized
240
+ formats are for ECR, ACR, and GCP Artifact Registry. If the URI does not match
241
+ any of these formats, a warning is printed indicating the registry type is not
242
+ recognized and the agent can't guarantee that images can be pushed.
243
+
244
+ If the format is recognized but does not resolve to an image name, an
245
+ error is raised. For example, if the URI is an ECR URI but does not include
246
+ an image name or includes a tag as well as an image name, an error is raised.
247
+ """
248
+ tag_msg = (
249
+ "Destination for built images may not include a tag, but the URI provided "
250
+ "includes the suffix '{tag}'. Please remove the tag and try again. The agent "
251
+ "will automatically tag each image with a unique hash of the source code."
252
+ )
253
+ if uri.startswith("https://"):
254
+ uri = uri[8:]
255
+
256
+ match = GCP_ARTIFACT_REGISTRY_URI_REGEX.match(uri)
257
+ if match:
258
+ if match.group("tag"):
259
+ raise ValueError(tag_msg.format(tag=match.group("tag")))
260
+ if not match.group("image_name"):
261
+ raise ValueError(
262
+ "An image name must be specified in the URI for a GCP Artifact Registry. "
263
+ "Please provide a uri with the format "
264
+ "'https://<region>-docker.pkg.dev/<project>/<repository>/<image>'."
265
+ )
266
+ return uri
267
+
268
+ match = AZURE_CONTAINER_REGISTRY_URI_REGEX.match(uri)
269
+ if match:
270
+ if match.group("tag"):
271
+ raise ValueError(tag_msg.format(tag=match.group("tag")))
272
+ if not match.group("repository"):
273
+ raise ValueError(
274
+ "A repository name must be specified in the URI for an "
275
+ "Azure Container Registry. Please provide a uri with the format "
276
+ "'https://<registry-name>.azurecr.io/<repository>'."
277
+ )
278
+ return uri
279
+
280
+ match = ELASTIC_CONTAINER_REGISTRY_URI_REGEX.match(uri)
281
+ if match:
282
+ if match.group("tag"):
283
+ raise ValueError(tag_msg.format(tag=match.group("tag")))
284
+ if not match.group("repository"):
285
+ raise ValueError(
286
+ "A repository name must be specified in the URI for an "
287
+ "Elastic Container Registry. Please provide a uri with the format "
288
+ "'https://<account-id>.dkr.ecr.<region>.amazonaws.com/<repository>'."
289
+ )
290
+ return uri
291
+
292
+ wandb.termwarn(
293
+ f"Unable to recognize registry type in URI {uri}. You are responsible "
294
+ "for ensuring the agent can push images to this registry."
295
+ )
296
+ return uri
@@ -1,4 +1,5 @@
1
1
  """Abstract plugin class defining the interface needed to build container images for W&B Launch."""
2
+
2
3
  from abc import ABC, abstractmethod
3
4
  from typing import TYPE_CHECKING, Any, Dict, Optional
4
5
 
@@ -6,6 +7,12 @@ from wandb.sdk.launch.environment.abstract import AbstractEnvironment
6
7
  from wandb.sdk.launch.registry.abstract import AbstractRegistry
7
8
 
8
9
  from .._project_spec import EntryPoint, LaunchProject
10
+ from ..registry.anon import AnonynmousRegistry
11
+ from ..utils import (
12
+ AZURE_CONTAINER_REGISTRY_URI_REGEX,
13
+ ELASTIC_CONTAINER_REGISTRY_URI_REGEX,
14
+ GCP_ARTIFACT_REGISTRY_URI_REGEX,
15
+ )
9
16
 
10
17
  if TYPE_CHECKING:
11
18
  from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
@@ -34,7 +41,7 @@ class AbstractBuilder(ABC):
34
41
  verify: Whether to verify the functionality of the builder.
35
42
 
36
43
  Raises:
37
- LaunchError: If the builder cannot be intialized or verified.
44
+ LaunchError: If the builder cannot be initialized or verified.
38
45
  """
39
46
  raise NotImplementedError
40
47
 
@@ -86,3 +93,64 @@ class AbstractBuilder(ABC):
86
93
  LaunchError: If the builder cannot be used to build images.
87
94
  """
88
95
  raise NotImplementedError
96
+
97
+
98
+ def registry_from_uri(uri: str) -> AbstractRegistry:
99
+ """Create a registry helper object from a uri.
100
+
101
+ This function parses the URI and determines which supported registry it
102
+ belongs to. It then creates a registry helper object for that registry.
103
+ The supported remote registry types are:
104
+ - Azure Container Registry
105
+ - Google Artifact Registry
106
+ - AWS Elastic Container Registry
107
+
108
+ The format of the URI is as follows:
109
+ - Azure Container Registry: <registry-name>.azurecr.io/<repo-name>/<image-name>
110
+ - Google Artifact Registry: <location>-docker.pkg.dev/<project-id>/<repo-name>/<image-name>
111
+ - AWS Elastic Container Registry: <account-id>.dkr.ecr.<region>.amazonaws.com/<repo-name>/<image-name>
112
+
113
+ Our classification of the registry is based on the domain name. For example,
114
+ if the uri contains `.azurecr.io`, we classify it as an Azure
115
+ Container Registry. If the uri contains `.dkr.ecr`, we classify
116
+ it as an AWS Elastic Container Registry. If the uri contains
117
+ `-docker.pkg.dev`, we classify it as a Google Artifact Registry.
118
+
119
+ This function will attempt to load the appropriate cloud helpers for the detected registry type.
120
+
121
+ `https://` prefix is optional for all of the above.
122
+
123
+ Arguments:
124
+ uri: The uri to create a registry from.
125
+
126
+ Returns:
127
+ The registry.
128
+
129
+ Raises:
130
+ LaunchError: If the registry helper cannot be loaded for the given URI.
131
+ """
132
+ if uri.startswith("https://"):
133
+ uri = uri[len("https://") :]
134
+
135
+ if AZURE_CONTAINER_REGISTRY_URI_REGEX.match(uri) is not None:
136
+ from wandb.sdk.launch.registry.azure_container_registry import (
137
+ AzureContainerRegistry,
138
+ )
139
+
140
+ return AzureContainerRegistry(uri=uri)
141
+
142
+ elif GCP_ARTIFACT_REGISTRY_URI_REGEX.match(uri) is not None:
143
+ from wandb.sdk.launch.registry.google_artifact_registry import (
144
+ GoogleArtifactRegistry,
145
+ )
146
+
147
+ return GoogleArtifactRegistry(uri=uri)
148
+
149
+ elif ELASTIC_CONTAINER_REGISTRY_URI_REGEX.match(uri) is not None:
150
+ from wandb.sdk.launch.registry.elastic_container_registry import (
151
+ ElasticContainerRegistry,
152
+ )
153
+
154
+ return ElasticContainerRegistry(uri=uri)
155
+
156
+ return AnonynmousRegistry(uri=uri)