wandb 0.15.9__py3-none-any.whl → 0.15.11__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. wandb/__init__.py +5 -1
  2. wandb/apis/public.py +137 -17
  3. wandb/apis/reports/_panels.py +1 -1
  4. wandb/apis/reports/blocks.py +1 -0
  5. wandb/apis/reports/report.py +27 -5
  6. wandb/cli/cli.py +52 -41
  7. wandb/docker/__init__.py +17 -0
  8. wandb/docker/auth.py +1 -1
  9. wandb/env.py +24 -4
  10. wandb/filesync/step_checksum.py +3 -3
  11. wandb/integration/openai/openai.py +3 -0
  12. wandb/integration/ultralytics/__init__.py +9 -0
  13. wandb/integration/ultralytics/bbox_utils.py +196 -0
  14. wandb/integration/ultralytics/callback.py +458 -0
  15. wandb/integration/ultralytics/classification_utils.py +66 -0
  16. wandb/integration/ultralytics/mask_utils.py +141 -0
  17. wandb/integration/ultralytics/pose_utils.py +92 -0
  18. wandb/integration/xgboost/xgboost.py +3 -3
  19. wandb/integration/yolov8/__init__.py +0 -7
  20. wandb/integration/yolov8/yolov8.py +22 -3
  21. wandb/old/settings.py +7 -0
  22. wandb/plot/line_series.py +0 -1
  23. wandb/proto/v3/wandb_internal_pb2.py +353 -300
  24. wandb/proto/v3/wandb_server_pb2.py +37 -41
  25. wandb/proto/v3/wandb_settings_pb2.py +2 -2
  26. wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
  27. wandb/proto/v4/wandb_internal_pb2.py +272 -260
  28. wandb/proto/v4/wandb_server_pb2.py +37 -40
  29. wandb/proto/v4/wandb_settings_pb2.py +2 -2
  30. wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
  31. wandb/proto/wandb_internal_codegen.py +7 -31
  32. wandb/sdk/artifacts/artifact.py +321 -189
  33. wandb/sdk/artifacts/artifact_cache.py +14 -0
  34. wandb/sdk/artifacts/artifact_manifest.py +5 -4
  35. wandb/sdk/artifacts/artifact_manifest_entry.py +37 -9
  36. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -9
  37. wandb/sdk/artifacts/artifact_saver.py +13 -50
  38. wandb/sdk/artifacts/artifact_ttl.py +6 -0
  39. wandb/sdk/artifacts/artifacts_cache.py +119 -93
  40. wandb/sdk/artifacts/staging.py +25 -0
  41. wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
  42. wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +2 -3
  43. wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
  44. wandb/sdk/artifacts/storage_policies/register.py +1 -0
  45. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +4 -3
  46. wandb/sdk/artifacts/storage_policy.py +4 -2
  47. wandb/sdk/backend/backend.py +0 -16
  48. wandb/sdk/data_types/image.py +3 -1
  49. wandb/sdk/integration_utils/auto_logging.py +38 -13
  50. wandb/sdk/interface/interface.py +16 -135
  51. wandb/sdk/interface/interface_shared.py +9 -147
  52. wandb/sdk/interface/interface_sock.py +0 -26
  53. wandb/sdk/internal/file_pusher.py +20 -3
  54. wandb/sdk/internal/file_stream.py +3 -1
  55. wandb/sdk/internal/handler.py +53 -70
  56. wandb/sdk/internal/internal_api.py +220 -130
  57. wandb/sdk/internal/job_builder.py +41 -37
  58. wandb/sdk/internal/sender.py +7 -25
  59. wandb/sdk/internal/system/assets/disk.py +144 -11
  60. wandb/sdk/internal/system/system_info.py +6 -2
  61. wandb/sdk/launch/__init__.py +5 -0
  62. wandb/sdk/launch/{launch.py → _launch.py} +53 -54
  63. wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
  64. wandb/sdk/launch/_project_spec.py +13 -2
  65. wandb/sdk/launch/agent/agent.py +103 -59
  66. wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
  67. wandb/sdk/launch/builder/build.py +19 -1
  68. wandb/sdk/launch/builder/docker_builder.py +5 -1
  69. wandb/sdk/launch/builder/kaniko_builder.py +5 -1
  70. wandb/sdk/launch/create_job.py +20 -5
  71. wandb/sdk/launch/loader.py +14 -5
  72. wandb/sdk/launch/runner/abstract.py +0 -2
  73. wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
  74. wandb/sdk/launch/runner/kubernetes_runner.py +66 -209
  75. wandb/sdk/launch/runner/local_container.py +5 -2
  76. wandb/sdk/launch/runner/local_process.py +4 -1
  77. wandb/sdk/launch/sweeps/scheduler.py +43 -25
  78. wandb/sdk/launch/sweeps/utils.py +5 -3
  79. wandb/sdk/launch/utils.py +3 -1
  80. wandb/sdk/lib/_settings_toposort_generate.py +3 -9
  81. wandb/sdk/lib/_settings_toposort_generated.py +27 -3
  82. wandb/sdk/lib/_wburls_generated.py +1 -0
  83. wandb/sdk/lib/filenames.py +27 -6
  84. wandb/sdk/lib/filesystem.py +181 -7
  85. wandb/sdk/lib/fsm.py +5 -3
  86. wandb/sdk/lib/gql_request.py +3 -0
  87. wandb/sdk/lib/ipython.py +7 -0
  88. wandb/sdk/lib/wburls.py +1 -0
  89. wandb/sdk/service/port_file.py +2 -15
  90. wandb/sdk/service/server.py +7 -55
  91. wandb/sdk/service/service.py +56 -26
  92. wandb/sdk/service/service_base.py +1 -1
  93. wandb/sdk/service/streams.py +11 -5
  94. wandb/sdk/verify/verify.py +2 -2
  95. wandb/sdk/wandb_init.py +8 -2
  96. wandb/sdk/wandb_manager.py +4 -14
  97. wandb/sdk/wandb_run.py +143 -53
  98. wandb/sdk/wandb_settings.py +148 -35
  99. wandb/testing/relay.py +85 -38
  100. wandb/util.py +87 -4
  101. wandb/wandb_torch.py +24 -38
  102. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/METADATA +48 -23
  103. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/RECORD +107 -103
  104. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/WHEEL +1 -1
  105. wandb/proto/v3/wandb_server_pb2_grpc.py +0 -1422
  106. wandb/proto/v4/wandb_server_pb2_grpc.py +0 -1422
  107. wandb/proto/wandb_server_pb2_grpc.py +0 -8
  108. wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +0 -61
  109. wandb/sdk/interface/interface_grpc.py +0 -460
  110. wandb/sdk/service/server_grpc.py +0 -444
  111. wandb/sdk/service/service_grpc.py +0 -73
  112. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
  113. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
  114. {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ import pprint
2
2
  from typing import Any, Dict, List, Optional
3
3
 
4
4
  import wandb
5
- import wandb.apis.public as public
5
+ from wandb.apis import public
6
6
  from wandb.apis.internal import Api
7
7
  from wandb.sdk.launch._project_spec import create_project_from_spec
8
8
  from wandb.sdk.launch.builder.build import build_image_from_project
@@ -49,39 +49,42 @@ def launch_add(
49
49
  """Enqueue a W&B launch experiment. With either a source uri, job or docker_image.
50
50
 
51
51
  Arguments:
52
- uri: URI of experiment to run. A wandb run uri or a Git repository URI.
53
- job: string reference to a wandb.Job eg: wandb/test/my-job:latest
54
- config: A dictionary containing the configuration for the run. May also contain
55
- resource specific arguments under the key "resource_args"
56
- project: Target project to send launched run to
57
- entity: Target entity to send launched run to
58
- queue: the name of the queue to enqueue the run to
59
- resource: Execution backend for the run: W&B provides built-in support for "local-container" backend
60
- entry_point: Entry point to run within the project. Defaults to using the entry point used
61
- in the original run for wandb URIs, or main.py for git repository URIs.
62
- name: Name run under which to launch the run.
63
- version: For Git-based projects, either a commit hash or a branch name.
64
- docker_image: The name of the docker image to use for the run.
65
- resource_args: Resource related arguments for launching runs onto a remote backend.
66
- Will be stored on the constructed launch config under ``resource_args``.
67
- run_id: optional string indicating the id of the launched run
68
- build: optional flag defaulting to false, requires queue to be set
69
- if build, an image is created, creates a job artifact, pushes a reference
70
- to that job artifact to queue
71
- repository: optional string to control the name of the remote repository, used when
72
- pushing images to a registry
73
- project_queue: optional string to control the name of the project for the queue. Primarily used
74
- for back compatibility with project scoped queues
52
+ uri: URI of experiment to run. A wandb run uri or a Git repository URI.
53
+ job: string reference to a wandb.Job eg: wandb/test/my-job:latest
54
+ config: A dictionary containing the configuration for the run. May also contain
55
+ resource specific arguments under the key "resource_args"
56
+ project: Target project to send launched run to
57
+ entity: Target entity to send launched run to
58
+ queue: the name of the queue to enqueue the run to
59
+ resource: Execution backend for the run: W&B provides built-in support for "local-container" backend
60
+ entry_point: Entry point to run within the project. Defaults to using the entry point used
61
+ in the original run for wandb URIs, or main.py for git repository URIs.
62
+ name: Name under which to launch the run.
63
+ version: For Git-based projects, either a commit hash or a branch name.
64
+ docker_image: The name of the docker image to use for the run.
65
+ resource_args: Resource related arguments for launching runs onto a remote backend.
66
+ Will be stored on the constructed launch config under ``resource_args``.
67
+ run_id: optional string indicating the id of the launched run
68
+ build: optional flag defaulting to false, requires queue to be set
69
+ if build, an image is created, creates a job artifact, pushes a reference
70
+ to that job artifact to queue
71
+ repository: optional string to control the name of the remote repository, used when
72
+ pushing images to a registry
73
+ project_queue: optional string to control the name of the project for the queue. Primarily used
74
+ for back compatibility with project scoped queues
75
75
 
76
76
 
77
77
  Example:
78
- import wandb
79
- project_uri = "https://github.com/wandb/examples"
80
- params = {"alpha": 0.5, "l1_ratio": 0.01}
81
- # Run W&B project and create a reproducible docker environment
82
- # on a local host
83
- api = wandb.apis.internal.Api()
84
- wandb.launch_add(uri=project_uri, parameters=params)
78
+ ```python
79
+ from wandb.sdk.launch import launch_add
80
+
81
+ project_uri = "https://github.com/wandb/examples"
82
+ params = {"alpha": 0.5, "l1_ratio": 0.01}
83
+ # Run W&B project and create a reproducible docker environment
84
+ # on a local host
85
+ api = wandb.apis.internal.Api()
86
+ launch_add(uri=project_uri, parameters=params)
87
+ ```
85
88
 
86
89
 
87
90
  Returns:
@@ -106,6 +106,7 @@ class LaunchProject:
106
106
  self.override_config: Dict[str, Any] = overrides.get("run_config", {})
107
107
  self.override_artifacts: Dict[str, Any] = overrides.get("artifacts", {})
108
108
  self.override_entrypoint: Optional[EntryPoint] = None
109
+ self.override_dockerfile: Optional[str] = overrides.get("dockerfile")
109
110
  self.deps_type: Optional[str] = None
110
111
  self._runtime: Optional[str] = None
111
112
  self.run_id = run_id or generate_id()
@@ -117,7 +118,8 @@ class LaunchProject:
117
118
  if override_entrypoint:
118
119
  _logger.info("Adding override entry point")
119
120
  self.override_entrypoint = EntryPoint(
120
- " ".join(override_entrypoint[0]), override_entrypoint
121
+ name=self._get_entrypoint_file(override_entrypoint),
122
+ command=override_entrypoint,
121
123
  )
122
124
 
123
125
  if overrides.get("sweep_id") is not None:
@@ -185,6 +187,15 @@ class LaunchProject:
185
187
  assert self.job is not None
186
188
  return wandb.util.make_docker_image_name_safe(self.job.split(":")[0])
187
189
 
190
+ def _get_entrypoint_file(self, entrypoint: List[str]) -> Optional[str]:
191
+ if not entrypoint:
192
+ return None
193
+ if entrypoint[0].endswith(".py") or entrypoint[0].endswith(".sh"):
194
+ return entrypoint[0]
195
+ if len(entrypoint) < 2:
196
+ return None
197
+ return entrypoint[1]
198
+
188
199
  def fill_macros(self, image: str) -> Dict[str, Any]:
189
200
  """Substitute values for macros in resource arguments.
190
201
 
@@ -415,7 +426,7 @@ class LaunchProject:
415
426
  class EntryPoint:
416
427
  """An entry point into a wandb launch specification."""
417
428
 
418
- def __init__(self, name: str, command: List[str]):
429
+ def __init__(self, name: Optional[str], command: List[str]):
419
430
  self.name = name
420
431
  self.command = command
421
432
 
@@ -6,13 +6,12 @@ import threading
6
6
  import time
7
7
  import traceback
8
8
  from multiprocessing import Event
9
- from multiprocessing.pool import ThreadPool
10
9
  from typing import Any, Dict, List, Optional, Union
11
10
 
12
11
  import wandb
13
12
  from wandb.apis.internal import Api
14
13
  from wandb.errors import CommError
15
- from wandb.sdk.launch.launch_add import launch_add
14
+ from wandb.sdk.launch._launch_add import launch_add
16
15
  from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
17
16
  from wandb.sdk.launch.runner.local_process import LocalProcessRunner
18
17
  from wandb.sdk.launch.sweeps.scheduler import Scheduler
@@ -35,9 +34,21 @@ AGENT_KILLED = "KILLED"
35
34
 
36
35
  HIDDEN_AGENT_RUN_TYPE = "sweep-controller"
37
36
 
38
- MAX_THREADS = 64
39
37
  MAX_RESUME_COUNT = 5
40
38
 
39
+ RUN_INFO_GRACE_PERIOD = 60
40
+
41
+ _env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
42
+ if _env_timeout:
43
+ try:
44
+ RUN_START_TIMEOUT = float(_env_timeout)
45
+ except ValueError:
46
+ raise LaunchError(
47
+ f"Invalid value for WANDB_LAUNCH_START_TIMEOUT: {_env_timeout}"
48
+ )
49
+ else:
50
+ RUN_START_TIMEOUT = 60 * 30 # default 30 minutes
51
+
41
52
  _logger = logging.getLogger(__name__)
42
53
 
43
54
 
@@ -129,13 +140,15 @@ class LaunchAgent:
129
140
  self._access = _convert_access("project")
130
141
  self._max_jobs = _max_from_config(config, "max_jobs")
131
142
  self._max_schedulers = _max_from_config(config, "max_schedulers")
132
- self._pool = ThreadPool(
133
- processes=int(min(MAX_THREADS, self._max_jobs + self._max_schedulers)),
134
- initargs=(self._jobs, self._jobs_lock),
135
- )
136
143
  self._secure_mode = config.get("secure_mode", False)
137
144
  self.default_config: Dict[str, Any] = config
138
145
 
146
+ # Get agent version from env var if present, otherwise wandb version
147
+ self.version: str = "wandb@" + wandb.__version__
148
+ env_agent_version = os.environ.get("WANDB_AGENT_VERSION")
149
+ if env_agent_version and env_agent_version != "wandb-launch-agent":
150
+ self.version = env_agent_version
151
+
139
152
  # serverside creation
140
153
  self.gorilla_supports_agents = (
141
154
  self._api.launch_agent_introspection() is not None
@@ -150,6 +163,7 @@ class LaunchAgent:
150
163
  self._project,
151
164
  self._queues,
152
165
  self.default_config,
166
+ self.version,
153
167
  self.gorilla_supports_agents,
154
168
  )
155
169
  self._id = create_response["launchAgentId"]
@@ -289,27 +303,43 @@ class LaunchAgent:
289
303
  job_and_run_status.err_stage,
290
304
  fnames,
291
305
  )
292
- elif job_and_run_status.completed_status not in ["stopped", "failed"]:
293
- _logger.info(
294
- "Skipping check for completed run status because run was successful"
295
- )
296
306
  elif job_and_run_status.run is not None:
297
307
  run_info = None
298
- # sweep runs exist but have no info before they are started
299
- # so run_info returned will be None
300
- # normal runs just throw a comm error
301
- # TODO: make more clear
302
- try:
303
- run_info = self._api.get_run_info(
304
- self._entity, job_and_run_status.project, job_and_run_status.run_id
305
- )
308
+ # We do some weird stuff here getting run info to check for a
309
+ # created run in W&B.
310
+ #
311
+ # We retry for 60 seconds with an exponential backoff in case
312
+ # upsert run is taking a while.
313
+ #
314
+ # Sweep runs exist but have no info before they are started
315
+ # so run_info returned will be None, while normal runs just throw a
316
+ # comm error.
317
+ start_time = time.time()
318
+ interval = 1
319
+ while True:
320
+ try:
321
+ run_info = self._api.get_run_info(
322
+ self._entity,
323
+ job_and_run_status.project,
324
+ job_and_run_status.run_id,
325
+ )
326
+ except CommError:
327
+ pass
328
+ if (
329
+ run_info is not None
330
+ or time.time() - start_time > RUN_INFO_GRACE_PERIOD
331
+ ):
332
+ break
333
+ if run_info is None:
334
+ time.sleep(interval)
335
+ interval *= 2
306
336
 
307
- except CommError:
308
- pass
309
337
  if run_info is None:
310
- _msg = "The submitted run was not successfully started"
311
338
  fnames = None
312
-
339
+ if job_and_run_status.completed_status == "finished":
340
+ _msg = "The submitted job exited successfully but failed to call wandb.init"
341
+ else:
342
+ _msg = "The submitted run was not successfully started"
313
343
  logs = job_and_run_status.run.get_logs()
314
344
  if logs:
315
345
  fnames = job_and_run_status.saver.save_contents(
@@ -319,7 +349,7 @@ class LaunchAgent:
319
349
  job_and_run_status.run_queue_item_id, _msg, "run", fnames
320
350
  )
321
351
  else:
322
- _logger.info("Finish thread id had no exception, ror run")
352
+ _logger.info(f"Finish thread id {thread_id} had no exception and no run")
323
353
  wandb._sentry.exception(
324
354
  "launch agent called finish thread id on thread without run or exception"
325
355
  )
@@ -359,19 +389,21 @@ class LaunchAgent:
359
389
 
360
390
  # Abort if this job attempts to override secure mode
361
391
  self._assert_secure(launch_spec)
362
-
363
- self._pool.apply_async(
364
- self.thread_run_job,
365
- (
392
+ job_tracker = JobAndRunStatusTracker(job["runQueueItemId"], queue, file_saver)
393
+ t = threading.Thread(
394
+ target=self.thread_run_job,
395
+ args=(
366
396
  launch_spec,
367
397
  job,
368
398
  self.default_config,
369
399
  self._api,
370
- queue,
371
- file_saver,
400
+ job_tracker,
372
401
  ),
402
+ daemon=True,
373
403
  )
374
404
 
405
+ t.start()
406
+
375
407
  def _assert_secure(self, launch_spec: Dict[str, Any]) -> None:
376
408
  """If secure mode is set, make sure no vulnerable keys are overridden."""
377
409
  if not self._secure_mode:
@@ -422,21 +454,23 @@ class LaunchAgent:
422
454
  for queue in self._queues:
423
455
  job = self.pop_from_queue(queue)
424
456
  if job:
425
- file_saver = RunQueueItemFileSaver(
426
- self._wandb_run, job["runQueueItemId"]
427
- )
428
- if _is_scheduler_job(job.get("runSpec")):
429
- # If job is a scheduler, and we are already at the cap, ignore,
430
- # don't ack, and it will be pushed back onto the queue in 1 min
431
- if self.num_running_schedulers >= self._max_schedulers:
432
- wandb.termwarn(
433
- f"{LOG_PREFIX}Agent already running the maximum number "
434
- f"of sweep schedulers: {self._max_schedulers}. To set "
435
- "this value use `max_schedulers` key in the agent config"
436
- )
437
- continue
438
-
439
457
  try:
458
+ file_saver = RunQueueItemFileSaver(
459
+ self._wandb_run, job["runQueueItemId"]
460
+ )
461
+ if _is_scheduler_job(job.get("runSpec")):
462
+ # If job is a scheduler, and we are already at the cap, ignore,
463
+ # don't ack, and it will be pushed back onto the queue in 1 min
464
+ if (
465
+ self.num_running_schedulers
466
+ >= self._max_schedulers
467
+ ):
468
+ wandb.termwarn(
469
+ f"{LOG_PREFIX}Agent already running the maximum number "
470
+ f"of sweep schedulers: {self._max_schedulers}. To set "
471
+ "this value use `max_schedulers` key in the agent config"
472
+ )
473
+ continue
440
474
  self.run_job(job, queue, file_saver)
441
475
  except Exception as e:
442
476
  wandb.termerror(
@@ -480,8 +514,6 @@ class LaunchAgent:
480
514
  self.update_status(AGENT_KILLED)
481
515
  wandb.termlog(f"{LOG_PREFIX}Shutting down, active jobs:")
482
516
  self.print_status()
483
- self._pool.close()
484
- self._pool.join()
485
517
 
486
518
  # Threaded functions
487
519
  def thread_run_job(
@@ -490,15 +522,13 @@ class LaunchAgent:
490
522
  job: Dict[str, Any],
491
523
  default_config: Dict[str, Any],
492
524
  api: Api,
493
- queue: str,
494
- file_saver: RunQueueItemFileSaver,
525
+ job_tracker: JobAndRunStatusTracker,
495
526
  ) -> None:
496
527
  thread_id = threading.current_thread().ident
497
- assert thread_id is not None
498
- job_tracker = JobAndRunStatusTracker(job["runQueueItemId"], queue, file_saver)
499
- with self._jobs_lock:
500
- self._jobs[thread_id] = job_tracker
528
+ assert thread_id
501
529
  try:
530
+ with self._jobs_lock:
531
+ self._jobs[thread_id] = job_tracker
502
532
  self._thread_run_job(
503
533
  launch_spec, job, default_config, api, thread_id, job_tracker
504
534
  )
@@ -540,7 +570,7 @@ class LaunchAgent:
540
570
  _logger.debug(f"Fetch sweep state error: {e}")
541
571
  state = None
542
572
 
543
- if state and state != "RUNNING" and state != "PAUSED":
573
+ if state != "RUNNING" and state != "PAUSED":
544
574
  raise LaunchError(
545
575
  f"Launch agent picked up sweep job, but sweep ({launch_spec['sweep_id']}) was in a terminal state ({state})"
546
576
  )
@@ -594,7 +624,18 @@ class LaunchAgent:
594
624
  return
595
625
  with self._jobs_lock:
596
626
  job_tracker.run = run
627
+ start_time = time.time()
597
628
  while self._jobs_event.is_set():
629
+ # If run has failed to start before timeout, kill it
630
+ state = run.get_status().state
631
+ if state == "starting" and RUN_START_TIMEOUT > 0:
632
+ if time.time() - start_time > RUN_START_TIMEOUT:
633
+ run.cancel()
634
+ raise LaunchError(
635
+ f"Run failed to start within {RUN_START_TIMEOUT} seconds. "
636
+ "If you want to increase this timeout, set WANDB_LAUNCH_START_TIMEOUT "
637
+ "to a larger value."
638
+ )
598
639
  if self._check_run_finished(job_tracker, launch_spec):
599
640
  return
600
641
  time.sleep(AGENT_POLLING_INTERVAL)
@@ -655,12 +696,15 @@ class LaunchAgent:
655
696
  wandb.termlog(f"{LOG_PREFIX}Scheduler finished with ID: {run.id}")
656
697
  if status == "failed":
657
698
  # on fail, update sweep state. scheduler run_id should == sweep_id
658
- self._api.set_sweep_state(
659
- sweep=job_tracker.run_id,
660
- entity=job_tracker.entity,
661
- project=job_tracker.project,
662
- state="CANCELED",
663
- )
699
+ try:
700
+ self._api.set_sweep_state(
701
+ sweep=job_tracker.run_id,
702
+ entity=job_tracker.entity,
703
+ project=job_tracker.project,
704
+ state="CANCELED",
705
+ )
706
+ except Exception as e:
707
+ raise LaunchError(f"Failed to update sweep state: {e}")
664
708
  else:
665
709
  wandb.termlog(f"{LOG_PREFIX}Job finished with ID: {run.id}")
666
710
  with self._jobs_lock:
@@ -5,8 +5,6 @@ import sys
5
5
  from typing import List, Optional, Union
6
6
 
7
7
  import wandb
8
- from wandb.sdk.lib import RunDisabled
9
- from wandb.sdk.wandb_run import Run
10
8
 
11
9
  if sys.version_info >= (3, 8):
12
10
  from typing import Literal
@@ -18,7 +16,11 @@ FileSubtypes = Literal["warning", "error"]
18
16
 
19
17
  class RunQueueItemFileSaver:
20
18
  def __init__(
21
- self, agent_run: Optional[Union[Run, RunDisabled]], run_queue_item_id: str
19
+ self,
20
+ agent_run: Optional[
21
+ Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
22
+ ],
23
+ run_queue_item_id: str,
22
24
  ):
23
25
  self.run_queue_item_id = run_queue_item_id
24
26
  self.run = agent_run
@@ -26,7 +28,7 @@ class RunQueueItemFileSaver:
26
28
  def save_contents(
27
29
  self, contents: str, fname: str, file_sub_type: FileSubtypes
28
30
  ) -> Optional[List[str]]:
29
- if not isinstance(self.run, Run):
31
+ if not isinstance(self.run, wandb.sdk.wandb_run.Run):
30
32
  wandb.termwarn("Not saving file contents because agent has no run")
31
33
  return None
32
34
  root_dir = self.run._settings.files_dir
@@ -36,6 +36,7 @@ _logger = logging.getLogger(__name__)
36
36
 
37
37
 
38
38
  _GENERATED_DOCKERFILE_NAME = "Dockerfile.wandb-autogenerated"
39
+ _DEFAULT_DOCKERFILE_NAME = "Dockerfile.wandb"
39
40
 
40
41
 
41
42
  def validate_docker_installation() -> None:
@@ -237,7 +238,7 @@ def get_env_vars_dict(
237
238
  if launch_project.sweep_id:
238
239
  env_vars["WANDB_SWEEP_ID"] = launch_project.sweep_id
239
240
  if launch_project.launch_spec.get("_resume_count", 0) > 0:
240
- env_vars["WANDB_RESUME"] = "must"
241
+ env_vars["WANDB_RESUME"] = "allow"
241
242
 
242
243
  _inject_wandb_config_env_vars(
243
244
  launch_project.override_config, env_vars, max_env_length
@@ -321,7 +322,24 @@ def generate_dockerfile(
321
322
  entry_point: EntryPoint,
322
323
  runner_type: str,
323
324
  builder_type: str,
325
+ dockerfile: Optional[str] = None,
324
326
  ) -> str:
327
+ override_entrypoint = launch_project.override_entrypoint or entry_point
328
+ if launch_project.project_dir is not None:
329
+ if not dockerfile and override_entrypoint.name is not None:
330
+ entrypoint_dir = os.path.dirname(override_entrypoint.name)
331
+ path = os.path.join(
332
+ launch_project.project_dir, entrypoint_dir, _DEFAULT_DOCKERFILE_NAME
333
+ )
334
+ if os.path.exists(path):
335
+ dockerfile = os.path.join(entrypoint_dir, _DEFAULT_DOCKERFILE_NAME)
336
+ if dockerfile:
337
+ path = os.path.join(launch_project.project_dir, dockerfile)
338
+ if not os.path.exists(path):
339
+ raise LaunchError(f"Dockerfile does not exist at {path}")
340
+ wandb.termlog(f"Using dockerfile: {dockerfile}")
341
+ return open(path).read()
342
+
325
343
  # get python versions truncated to major.minor to ensure image availability
326
344
  if launch_project.python_version:
327
345
  spl = launch_project.python_version.split(".")[:2]
@@ -121,7 +121,11 @@ class DockerBuilder(AbstractBuilder):
121
121
  entrypoint (EntryPoint): The entrypoint to use.
122
122
  """
123
123
  dockerfile_str = generate_dockerfile(
124
- launch_project, entrypoint, launch_project.resource, "docker"
124
+ launch_project=launch_project,
125
+ entry_point=entrypoint,
126
+ runner_type=launch_project.resource,
127
+ builder_type="docker",
128
+ dockerfile=launch_project.override_dockerfile,
125
129
  )
126
130
 
127
131
  image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
@@ -241,7 +241,11 @@ class KanikoBuilder(AbstractBuilder):
241
241
  raise LaunchError("No registry specified for Kaniko build.")
242
242
  # kaniko builder doesn't seem to work with a custom user id, need more investigation
243
243
  dockerfile_str = generate_dockerfile(
244
- launch_project, entrypoint, launch_project.resource, "kaniko"
244
+ launch_project=launch_project,
245
+ entry_point=entrypoint,
246
+ runner_type=launch_project.resource,
247
+ builder_type="kaniko",
248
+ dockerfile=launch_project.override_dockerfile,
245
249
  )
246
250
  image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
247
251
  repo_uri = self.registry.get_repo_uri()
@@ -63,7 +63,7 @@ def create_job(
63
63
  runtime="3.9",
64
64
  entrypoint="train.py",
65
65
  )
66
- # then, use you newly created job
66
+ # then run the newly created job
67
67
  artifact_job.call()
68
68
  ```
69
69
  """
@@ -180,7 +180,6 @@ def _create_job(
180
180
  run_name=run.id, # run will be deleted after creation
181
181
  description=description,
182
182
  metadata=metadata,
183
- labels=["manually-created"],
184
183
  is_user_created=True,
185
184
  aliases=[{"artifactCollectionName": name, "alias": a} for a in aliases],
186
185
  )
@@ -335,19 +334,33 @@ def _create_repo_metadata(
335
334
  entrypoint = rel_entrypoint
336
335
 
337
336
  # check if requirements.txt exists
338
- if not os.path.exists(os.path.join(local_dir, "requirements.txt")):
339
- repo_formd = path.replace(entrypoint, "")
337
+ # start at the location of the python file and recurse up to the git root
338
+ req_dir = local_dir
339
+ while (
340
+ not os.path.exists(os.path.join(req_dir, "requirements.txt"))
341
+ and req_dir != tempdir
342
+ ):
343
+ req_dir = os.path.dirname(req_dir)
344
+
345
+ if not os.path.exists(os.path.join(req_dir, "requirements.txt")):
340
346
  wandb.termerror(
341
- f"Could not find requirements.txt file in git repo at: {repo_formd}/requirements.txt"
347
+ "Could not find requirements.txt file in git repo at "
348
+ f"{os.path.join(os.path.dirname(path), 'requirements.txt')} "
349
+ "or parent directories."
342
350
  )
343
351
  return None
344
352
 
353
+ wandb.termlog(
354
+ f"Using requirements.txt in {req_dir.replace(tempdir, '') or 'repository root'}"
355
+ )
356
+
345
357
  metadata = {
346
358
  "git": {
347
359
  "commit": commit,
348
360
  "remote": ref.url,
349
361
  },
350
362
  "root": ref.repo,
363
+ "codePathLocal": entrypoint, # not in git context, optionally also set local
351
364
  "codePath": entrypoint,
352
365
  "entrypoint": [f"python{python_version}", entrypoint],
353
366
  "python": python_version, # used to build container
@@ -426,6 +439,8 @@ def _configure_job_builder_for_partial(tmpdir: str, job_source: str) -> JobBuild
426
439
  job_builder = JobBuilder(
427
440
  settings=settings,
428
441
  )
442
+ # never allow notebook runs
443
+ job_builder._is_notebook_run = False
429
444
  # set run inputs and outputs to empty dicts
430
445
  job_builder.set_config({})
431
446
  job_builder.set_summary({})
@@ -3,6 +3,7 @@ from typing import Any, Dict, Optional
3
3
 
4
4
  import wandb
5
5
  from wandb.apis.internal import Api
6
+ from wandb.docker import is_docker_installed
6
7
  from wandb.sdk.launch.errors import LaunchError
7
8
 
8
9
  from .builder.abstract import AbstractBuilder
@@ -141,7 +142,10 @@ def builder_from_config(
141
142
  This helper function is used to create a builder from a config. The
142
143
  config should have a "type" key that specifies the type of builder to import
143
144
  and create. The remaining keys are passed to the builder's from_config
144
- method. If the config is None or empty, a DockerBuilder is returned.
145
+ method. If the config is None or empty, a default builder is returned.
146
+
147
+ The default builder will be a DockerBuilder if we find a working docker cli
148
+ on the system, otherwise it will be a NoOpBuilder.
145
149
 
146
150
  Arguments:
147
151
  config (Dict[str, Any]): The builder config.
@@ -154,11 +158,16 @@ def builder_from_config(
154
158
  LaunchError: If the builder is not configured correctly.
155
159
  """
156
160
  if not config:
157
- from .builder.docker_builder import DockerBuilder
161
+ if is_docker_installed():
162
+ from .builder.docker_builder import DockerBuilder
163
+
164
+ return DockerBuilder.from_config(
165
+ {}, environment, registry
166
+ ) # This is the default builder.
167
+
168
+ from .builder.noop import NoOpBuilder
158
169
 
159
- return DockerBuilder.from_config(
160
- {}, environment, registry
161
- ) # This is the default builder.
170
+ return NoOpBuilder.from_config({}, environment, registry)
162
171
 
163
172
  builder_type = config.get("type")
164
173
  if builder_type is None:
@@ -13,7 +13,6 @@ from typing import Any, Dict, List, Optional, Union
13
13
  from dockerpycreds.utils import find_executable # type: ignore
14
14
 
15
15
  import wandb
16
- from wandb import Settings
17
16
  from wandb.apis.internal import Api
18
17
  from wandb.sdk.lib import runid
19
18
 
@@ -136,7 +135,6 @@ class AbstractRunner(ABC):
136
135
  api: Api,
137
136
  backend_config: Dict[str, Any],
138
137
  ) -> None:
139
- self._settings = Settings()
140
138
  self._api = api
141
139
  self.backend_config = backend_config
142
140
  self._cwd = os.getcwd()