wandb 0.15.9__py3-none-any.whl → 0.15.11__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +5 -1
- wandb/apis/public.py +137 -17
- wandb/apis/reports/_panels.py +1 -1
- wandb/apis/reports/blocks.py +1 -0
- wandb/apis/reports/report.py +27 -5
- wandb/cli/cli.py +52 -41
- wandb/docker/__init__.py +17 -0
- wandb/docker/auth.py +1 -1
- wandb/env.py +24 -4
- wandb/filesync/step_checksum.py +3 -3
- wandb/integration/openai/openai.py +3 -0
- wandb/integration/ultralytics/__init__.py +9 -0
- wandb/integration/ultralytics/bbox_utils.py +196 -0
- wandb/integration/ultralytics/callback.py +458 -0
- wandb/integration/ultralytics/classification_utils.py +66 -0
- wandb/integration/ultralytics/mask_utils.py +141 -0
- wandb/integration/ultralytics/pose_utils.py +92 -0
- wandb/integration/xgboost/xgboost.py +3 -3
- wandb/integration/yolov8/__init__.py +0 -7
- wandb/integration/yolov8/yolov8.py +22 -3
- wandb/old/settings.py +7 -0
- wandb/plot/line_series.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +353 -300
- wandb/proto/v3/wandb_server_pb2.py +37 -41
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
- wandb/proto/v4/wandb_internal_pb2.py +272 -260
- wandb/proto/v4/wandb_server_pb2.py +37 -40
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
- wandb/proto/wandb_internal_codegen.py +7 -31
- wandb/sdk/artifacts/artifact.py +321 -189
- wandb/sdk/artifacts/artifact_cache.py +14 -0
- wandb/sdk/artifacts/artifact_manifest.py +5 -4
- wandb/sdk/artifacts/artifact_manifest_entry.py +37 -9
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -9
- wandb/sdk/artifacts/artifact_saver.py +13 -50
- wandb/sdk/artifacts/artifact_ttl.py +6 -0
- wandb/sdk/artifacts/artifacts_cache.py +119 -93
- wandb/sdk/artifacts/staging.py +25 -0
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +2 -3
- wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
- wandb/sdk/artifacts/storage_policies/register.py +1 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +4 -3
- wandb/sdk/artifacts/storage_policy.py +4 -2
- wandb/sdk/backend/backend.py +0 -16
- wandb/sdk/data_types/image.py +3 -1
- wandb/sdk/integration_utils/auto_logging.py +38 -13
- wandb/sdk/interface/interface.py +16 -135
- wandb/sdk/interface/interface_shared.py +9 -147
- wandb/sdk/interface/interface_sock.py +0 -26
- wandb/sdk/internal/file_pusher.py +20 -3
- wandb/sdk/internal/file_stream.py +3 -1
- wandb/sdk/internal/handler.py +53 -70
- wandb/sdk/internal/internal_api.py +220 -130
- wandb/sdk/internal/job_builder.py +41 -37
- wandb/sdk/internal/sender.py +7 -25
- wandb/sdk/internal/system/assets/disk.py +144 -11
- wandb/sdk/internal/system/system_info.py +6 -2
- wandb/sdk/launch/__init__.py +5 -0
- wandb/sdk/launch/{launch.py → _launch.py} +53 -54
- wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
- wandb/sdk/launch/_project_spec.py +13 -2
- wandb/sdk/launch/agent/agent.py +103 -59
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
- wandb/sdk/launch/builder/build.py +19 -1
- wandb/sdk/launch/builder/docker_builder.py +5 -1
- wandb/sdk/launch/builder/kaniko_builder.py +5 -1
- wandb/sdk/launch/create_job.py +20 -5
- wandb/sdk/launch/loader.py +14 -5
- wandb/sdk/launch/runner/abstract.py +0 -2
- wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +66 -209
- wandb/sdk/launch/runner/local_container.py +5 -2
- wandb/sdk/launch/runner/local_process.py +4 -1
- wandb/sdk/launch/sweeps/scheduler.py +43 -25
- wandb/sdk/launch/sweeps/utils.py +5 -3
- wandb/sdk/launch/utils.py +3 -1
- wandb/sdk/lib/_settings_toposort_generate.py +3 -9
- wandb/sdk/lib/_settings_toposort_generated.py +27 -3
- wandb/sdk/lib/_wburls_generated.py +1 -0
- wandb/sdk/lib/filenames.py +27 -6
- wandb/sdk/lib/filesystem.py +181 -7
- wandb/sdk/lib/fsm.py +5 -3
- wandb/sdk/lib/gql_request.py +3 -0
- wandb/sdk/lib/ipython.py +7 -0
- wandb/sdk/lib/wburls.py +1 -0
- wandb/sdk/service/port_file.py +2 -15
- wandb/sdk/service/server.py +7 -55
- wandb/sdk/service/service.py +56 -26
- wandb/sdk/service/service_base.py +1 -1
- wandb/sdk/service/streams.py +11 -5
- wandb/sdk/verify/verify.py +2 -2
- wandb/sdk/wandb_init.py +8 -2
- wandb/sdk/wandb_manager.py +4 -14
- wandb/sdk/wandb_run.py +143 -53
- wandb/sdk/wandb_settings.py +148 -35
- wandb/testing/relay.py +85 -38
- wandb/util.py +87 -4
- wandb/wandb_torch.py +24 -38
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/METADATA +48 -23
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/RECORD +107 -103
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/WHEEL +1 -1
- wandb/proto/v3/wandb_server_pb2_grpc.py +0 -1422
- wandb/proto/v4/wandb_server_pb2_grpc.py +0 -1422
- wandb/proto/wandb_server_pb2_grpc.py +0 -8
- wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +0 -61
- wandb/sdk/interface/interface_grpc.py +0 -460
- wandb/sdk/service/server_grpc.py +0 -444
- wandb/sdk/service/service_grpc.py +0 -73
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ import pprint
|
|
2
2
|
from typing import Any, Dict, List, Optional
|
3
3
|
|
4
4
|
import wandb
|
5
|
-
|
5
|
+
from wandb.apis import public
|
6
6
|
from wandb.apis.internal import Api
|
7
7
|
from wandb.sdk.launch._project_spec import create_project_from_spec
|
8
8
|
from wandb.sdk.launch.builder.build import build_image_from_project
|
@@ -49,39 +49,42 @@ def launch_add(
|
|
49
49
|
"""Enqueue a W&B launch experiment. With either a source uri, job or docker_image.
|
50
50
|
|
51
51
|
Arguments:
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
52
|
+
uri: URI of experiment to run. A wandb run uri or a Git repository URI.
|
53
|
+
job: string reference to a wandb.Job eg: wandb/test/my-job:latest
|
54
|
+
config: A dictionary containing the configuration for the run. May also contain
|
55
|
+
resource specific arguments under the key "resource_args"
|
56
|
+
project: Target project to send launched run to
|
57
|
+
entity: Target entity to send launched run to
|
58
|
+
queue: the name of the queue to enqueue the run to
|
59
|
+
resource: Execution backend for the run: W&B provides built-in support for "local-container" backend
|
60
|
+
entry_point: Entry point to run within the project. Defaults to using the entry point used
|
61
|
+
in the original run for wandb URIs, or main.py for git repository URIs.
|
62
|
+
name: Name run under which to launch the run.
|
63
|
+
version: For Git-based projects, either a commit hash or a branch name.
|
64
|
+
docker_image: The name of the docker image to use for the run.
|
65
|
+
resource_args: Resource related arguments for launching runs onto a remote backend.
|
66
|
+
Will be stored on the constructed launch config under ``resource_args``.
|
67
|
+
run_id: optional string indicating the id of the launched run
|
68
|
+
build: optional flag defaulting to false, requires queue to be set
|
69
|
+
if build, an image is created, creates a job artifact, pushes a reference
|
70
|
+
to that job artifact to queue
|
71
|
+
repository: optional string to control the name of the remote repository, used when
|
72
|
+
pushing images to a registry
|
73
|
+
project_queue: optional string to control the name of the project for the queue. Primarily used
|
74
|
+
for back compatibility with project scoped queues
|
75
75
|
|
76
76
|
|
77
77
|
Example:
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
78
|
+
```python
|
79
|
+
from wandb.sdk.launch import launch_add
|
80
|
+
|
81
|
+
project_uri = "https://github.com/wandb/examples"
|
82
|
+
params = {"alpha": 0.5, "l1_ratio": 0.01}
|
83
|
+
# Run W&B project and create a reproducible docker environment
|
84
|
+
# on a local host
|
85
|
+
api = wandb.apis.internal.Api()
|
86
|
+
launch_add(uri=project_uri, parameters=params)
|
87
|
+
```
|
85
88
|
|
86
89
|
|
87
90
|
Returns:
|
@@ -106,6 +106,7 @@ class LaunchProject:
|
|
106
106
|
self.override_config: Dict[str, Any] = overrides.get("run_config", {})
|
107
107
|
self.override_artifacts: Dict[str, Any] = overrides.get("artifacts", {})
|
108
108
|
self.override_entrypoint: Optional[EntryPoint] = None
|
109
|
+
self.override_dockerfile: Optional[str] = overrides.get("dockerfile")
|
109
110
|
self.deps_type: Optional[str] = None
|
110
111
|
self._runtime: Optional[str] = None
|
111
112
|
self.run_id = run_id or generate_id()
|
@@ -117,7 +118,8 @@ class LaunchProject:
|
|
117
118
|
if override_entrypoint:
|
118
119
|
_logger.info("Adding override entry point")
|
119
120
|
self.override_entrypoint = EntryPoint(
|
120
|
-
|
121
|
+
name=self._get_entrypoint_file(override_entrypoint),
|
122
|
+
command=override_entrypoint,
|
121
123
|
)
|
122
124
|
|
123
125
|
if overrides.get("sweep_id") is not None:
|
@@ -185,6 +187,15 @@ class LaunchProject:
|
|
185
187
|
assert self.job is not None
|
186
188
|
return wandb.util.make_docker_image_name_safe(self.job.split(":")[0])
|
187
189
|
|
190
|
+
def _get_entrypoint_file(self, entrypoint: List[str]) -> Optional[str]:
|
191
|
+
if not entrypoint:
|
192
|
+
return None
|
193
|
+
if entrypoint[0].endswith(".py") or entrypoint[0].endswith(".sh"):
|
194
|
+
return entrypoint[0]
|
195
|
+
if len(entrypoint) < 2:
|
196
|
+
return None
|
197
|
+
return entrypoint[1]
|
198
|
+
|
188
199
|
def fill_macros(self, image: str) -> Dict[str, Any]:
|
189
200
|
"""Substitute values for macros in resource arguments.
|
190
201
|
|
@@ -415,7 +426,7 @@ class LaunchProject:
|
|
415
426
|
class EntryPoint:
|
416
427
|
"""An entry point into a wandb launch specification."""
|
417
428
|
|
418
|
-
def __init__(self, name: str, command: List[str]):
|
429
|
+
def __init__(self, name: Optional[str], command: List[str]):
|
419
430
|
self.name = name
|
420
431
|
self.command = command
|
421
432
|
|
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -6,13 +6,12 @@ import threading
|
|
6
6
|
import time
|
7
7
|
import traceback
|
8
8
|
from multiprocessing import Event
|
9
|
-
from multiprocessing.pool import ThreadPool
|
10
9
|
from typing import Any, Dict, List, Optional, Union
|
11
10
|
|
12
11
|
import wandb
|
13
12
|
from wandb.apis.internal import Api
|
14
13
|
from wandb.errors import CommError
|
15
|
-
from wandb.sdk.launch.
|
14
|
+
from wandb.sdk.launch._launch_add import launch_add
|
16
15
|
from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
|
17
16
|
from wandb.sdk.launch.runner.local_process import LocalProcessRunner
|
18
17
|
from wandb.sdk.launch.sweeps.scheduler import Scheduler
|
@@ -35,9 +34,21 @@ AGENT_KILLED = "KILLED"
|
|
35
34
|
|
36
35
|
HIDDEN_AGENT_RUN_TYPE = "sweep-controller"
|
37
36
|
|
38
|
-
MAX_THREADS = 64
|
39
37
|
MAX_RESUME_COUNT = 5
|
40
38
|
|
39
|
+
RUN_INFO_GRACE_PERIOD = 60
|
40
|
+
|
41
|
+
_env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
|
42
|
+
if _env_timeout:
|
43
|
+
try:
|
44
|
+
RUN_START_TIMEOUT = float(_env_timeout)
|
45
|
+
except ValueError:
|
46
|
+
raise LaunchError(
|
47
|
+
f"Invalid value for WANDB_LAUNCH_START_TIMEOUT: {_env_timeout}"
|
48
|
+
)
|
49
|
+
else:
|
50
|
+
RUN_START_TIMEOUT = 60 * 30 # default 30 minutes
|
51
|
+
|
41
52
|
_logger = logging.getLogger(__name__)
|
42
53
|
|
43
54
|
|
@@ -129,13 +140,15 @@ class LaunchAgent:
|
|
129
140
|
self._access = _convert_access("project")
|
130
141
|
self._max_jobs = _max_from_config(config, "max_jobs")
|
131
142
|
self._max_schedulers = _max_from_config(config, "max_schedulers")
|
132
|
-
self._pool = ThreadPool(
|
133
|
-
processes=int(min(MAX_THREADS, self._max_jobs + self._max_schedulers)),
|
134
|
-
initargs=(self._jobs, self._jobs_lock),
|
135
|
-
)
|
136
143
|
self._secure_mode = config.get("secure_mode", False)
|
137
144
|
self.default_config: Dict[str, Any] = config
|
138
145
|
|
146
|
+
# Get agent version from env var if present, otherwise wandb version
|
147
|
+
self.version: str = "wandb@" + wandb.__version__
|
148
|
+
env_agent_version = os.environ.get("WANDB_AGENT_VERSION")
|
149
|
+
if env_agent_version and env_agent_version != "wandb-launch-agent":
|
150
|
+
self.version = env_agent_version
|
151
|
+
|
139
152
|
# serverside creation
|
140
153
|
self.gorilla_supports_agents = (
|
141
154
|
self._api.launch_agent_introspection() is not None
|
@@ -150,6 +163,7 @@ class LaunchAgent:
|
|
150
163
|
self._project,
|
151
164
|
self._queues,
|
152
165
|
self.default_config,
|
166
|
+
self.version,
|
153
167
|
self.gorilla_supports_agents,
|
154
168
|
)
|
155
169
|
self._id = create_response["launchAgentId"]
|
@@ -289,27 +303,43 @@ class LaunchAgent:
|
|
289
303
|
job_and_run_status.err_stage,
|
290
304
|
fnames,
|
291
305
|
)
|
292
|
-
elif job_and_run_status.completed_status not in ["stopped", "failed"]:
|
293
|
-
_logger.info(
|
294
|
-
"Skipping check for completed run status because run was successful"
|
295
|
-
)
|
296
306
|
elif job_and_run_status.run is not None:
|
297
307
|
run_info = None
|
298
|
-
#
|
299
|
-
#
|
300
|
-
#
|
301
|
-
#
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
308
|
+
# We do some weird stuff here getting run info to check for a
|
309
|
+
# created in run in W&B.
|
310
|
+
#
|
311
|
+
# We retry for 60 seconds with an exponential backoff in case
|
312
|
+
# upsert run is taking a while.
|
313
|
+
#
|
314
|
+
# Sweep runs exist but have no info before they are started
|
315
|
+
# so run_info returned will be None, while normal runs just throw a
|
316
|
+
# comm error.
|
317
|
+
start_time = time.time()
|
318
|
+
interval = 1
|
319
|
+
while True:
|
320
|
+
try:
|
321
|
+
run_info = self._api.get_run_info(
|
322
|
+
self._entity,
|
323
|
+
job_and_run_status.project,
|
324
|
+
job_and_run_status.run_id,
|
325
|
+
)
|
326
|
+
except CommError:
|
327
|
+
pass
|
328
|
+
if (
|
329
|
+
run_info is not None
|
330
|
+
or time.time() - start_time > RUN_INFO_GRACE_PERIOD
|
331
|
+
):
|
332
|
+
break
|
333
|
+
if run_info is None:
|
334
|
+
time.sleep(interval)
|
335
|
+
interval *= 2
|
306
336
|
|
307
|
-
except CommError:
|
308
|
-
pass
|
309
337
|
if run_info is None:
|
310
|
-
_msg = "The submitted run was not successfully started"
|
311
338
|
fnames = None
|
312
|
-
|
339
|
+
if job_and_run_status.completed_status == "finished":
|
340
|
+
_msg = "The submitted job exited successfully but failed to call wandb.init"
|
341
|
+
else:
|
342
|
+
_msg = "The submitted run was not successfully started"
|
313
343
|
logs = job_and_run_status.run.get_logs()
|
314
344
|
if logs:
|
315
345
|
fnames = job_and_run_status.saver.save_contents(
|
@@ -319,7 +349,7 @@ class LaunchAgent:
|
|
319
349
|
job_and_run_status.run_queue_item_id, _msg, "run", fnames
|
320
350
|
)
|
321
351
|
else:
|
322
|
-
_logger.info("Finish thread id had no exception
|
352
|
+
_logger.info(f"Finish thread id {thread_id} had no exception and no run")
|
323
353
|
wandb._sentry.exception(
|
324
354
|
"launch agent called finish thread id on thread without run or exception"
|
325
355
|
)
|
@@ -359,19 +389,21 @@ class LaunchAgent:
|
|
359
389
|
|
360
390
|
# Abort if this job attempts to override secure mode
|
361
391
|
self._assert_secure(launch_spec)
|
362
|
-
|
363
|
-
|
364
|
-
self.thread_run_job,
|
365
|
-
(
|
392
|
+
job_tracker = JobAndRunStatusTracker(job["runQueueItemId"], queue, file_saver)
|
393
|
+
t = threading.Thread(
|
394
|
+
target=self.thread_run_job,
|
395
|
+
args=(
|
366
396
|
launch_spec,
|
367
397
|
job,
|
368
398
|
self.default_config,
|
369
399
|
self._api,
|
370
|
-
|
371
|
-
file_saver,
|
400
|
+
job_tracker,
|
372
401
|
),
|
402
|
+
daemon=True,
|
373
403
|
)
|
374
404
|
|
405
|
+
t.start()
|
406
|
+
|
375
407
|
def _assert_secure(self, launch_spec: Dict[str, Any]) -> None:
|
376
408
|
"""If secure mode is set, make sure no vulnerable keys are overridden."""
|
377
409
|
if not self._secure_mode:
|
@@ -422,21 +454,23 @@ class LaunchAgent:
|
|
422
454
|
for queue in self._queues:
|
423
455
|
job = self.pop_from_queue(queue)
|
424
456
|
if job:
|
425
|
-
file_saver = RunQueueItemFileSaver(
|
426
|
-
self._wandb_run, job["runQueueItemId"]
|
427
|
-
)
|
428
|
-
if _is_scheduler_job(job.get("runSpec")):
|
429
|
-
# If job is a scheduler, and we are already at the cap, ignore,
|
430
|
-
# don't ack, and it will be pushed back onto the queue in 1 min
|
431
|
-
if self.num_running_schedulers >= self._max_schedulers:
|
432
|
-
wandb.termwarn(
|
433
|
-
f"{LOG_PREFIX}Agent already running the maximum number "
|
434
|
-
f"of sweep schedulers: {self._max_schedulers}. To set "
|
435
|
-
"this value use `max_schedulers` key in the agent config"
|
436
|
-
)
|
437
|
-
continue
|
438
|
-
|
439
457
|
try:
|
458
|
+
file_saver = RunQueueItemFileSaver(
|
459
|
+
self._wandb_run, job["runQueueItemId"]
|
460
|
+
)
|
461
|
+
if _is_scheduler_job(job.get("runSpec")):
|
462
|
+
# If job is a scheduler, and we are already at the cap, ignore,
|
463
|
+
# don't ack, and it will be pushed back onto the queue in 1 min
|
464
|
+
if (
|
465
|
+
self.num_running_schedulers
|
466
|
+
>= self._max_schedulers
|
467
|
+
):
|
468
|
+
wandb.termwarn(
|
469
|
+
f"{LOG_PREFIX}Agent already running the maximum number "
|
470
|
+
f"of sweep schedulers: {self._max_schedulers}. To set "
|
471
|
+
"this value use `max_schedulers` key in the agent config"
|
472
|
+
)
|
473
|
+
continue
|
440
474
|
self.run_job(job, queue, file_saver)
|
441
475
|
except Exception as e:
|
442
476
|
wandb.termerror(
|
@@ -480,8 +514,6 @@ class LaunchAgent:
|
|
480
514
|
self.update_status(AGENT_KILLED)
|
481
515
|
wandb.termlog(f"{LOG_PREFIX}Shutting down, active jobs:")
|
482
516
|
self.print_status()
|
483
|
-
self._pool.close()
|
484
|
-
self._pool.join()
|
485
517
|
|
486
518
|
# Threaded functions
|
487
519
|
def thread_run_job(
|
@@ -490,15 +522,13 @@ class LaunchAgent:
|
|
490
522
|
job: Dict[str, Any],
|
491
523
|
default_config: Dict[str, Any],
|
492
524
|
api: Api,
|
493
|
-
|
494
|
-
file_saver: RunQueueItemFileSaver,
|
525
|
+
job_tracker: JobAndRunStatusTracker,
|
495
526
|
) -> None:
|
496
527
|
thread_id = threading.current_thread().ident
|
497
|
-
assert thread_id
|
498
|
-
job_tracker = JobAndRunStatusTracker(job["runQueueItemId"], queue, file_saver)
|
499
|
-
with self._jobs_lock:
|
500
|
-
self._jobs[thread_id] = job_tracker
|
528
|
+
assert thread_id
|
501
529
|
try:
|
530
|
+
with self._jobs_lock:
|
531
|
+
self._jobs[thread_id] = job_tracker
|
502
532
|
self._thread_run_job(
|
503
533
|
launch_spec, job, default_config, api, thread_id, job_tracker
|
504
534
|
)
|
@@ -540,7 +570,7 @@ class LaunchAgent:
|
|
540
570
|
_logger.debug(f"Fetch sweep state error: {e}")
|
541
571
|
state = None
|
542
572
|
|
543
|
-
if state
|
573
|
+
if state != "RUNNING" and state != "PAUSED":
|
544
574
|
raise LaunchError(
|
545
575
|
f"Launch agent picked up sweep job, but sweep ({launch_spec['sweep_id']}) was in a terminal state ({state})"
|
546
576
|
)
|
@@ -594,7 +624,18 @@ class LaunchAgent:
|
|
594
624
|
return
|
595
625
|
with self._jobs_lock:
|
596
626
|
job_tracker.run = run
|
627
|
+
start_time = time.time()
|
597
628
|
while self._jobs_event.is_set():
|
629
|
+
# If run has failed to start before timeout, kill it
|
630
|
+
state = run.get_status().state
|
631
|
+
if state == "starting" and RUN_START_TIMEOUT > 0:
|
632
|
+
if time.time() - start_time > RUN_START_TIMEOUT:
|
633
|
+
run.cancel()
|
634
|
+
raise LaunchError(
|
635
|
+
f"Run failed to start within {RUN_START_TIMEOUT} seconds. "
|
636
|
+
"If you want to increase this timeout, set WANDB_LAUNCH_START_TIMEOUT "
|
637
|
+
"to a larger value."
|
638
|
+
)
|
598
639
|
if self._check_run_finished(job_tracker, launch_spec):
|
599
640
|
return
|
600
641
|
time.sleep(AGENT_POLLING_INTERVAL)
|
@@ -655,12 +696,15 @@ class LaunchAgent:
|
|
655
696
|
wandb.termlog(f"{LOG_PREFIX}Scheduler finished with ID: {run.id}")
|
656
697
|
if status == "failed":
|
657
698
|
# on fail, update sweep state. scheduler run_id should == sweep_id
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
699
|
+
try:
|
700
|
+
self._api.set_sweep_state(
|
701
|
+
sweep=job_tracker.run_id,
|
702
|
+
entity=job_tracker.entity,
|
703
|
+
project=job_tracker.project,
|
704
|
+
state="CANCELED",
|
705
|
+
)
|
706
|
+
except Exception as e:
|
707
|
+
raise LaunchError(f"Failed to update sweep state: {e}")
|
664
708
|
else:
|
665
709
|
wandb.termlog(f"{LOG_PREFIX}Job finished with ID: {run.id}")
|
666
710
|
with self._jobs_lock:
|
@@ -5,8 +5,6 @@ import sys
|
|
5
5
|
from typing import List, Optional, Union
|
6
6
|
|
7
7
|
import wandb
|
8
|
-
from wandb.sdk.lib import RunDisabled
|
9
|
-
from wandb.sdk.wandb_run import Run
|
10
8
|
|
11
9
|
if sys.version_info >= (3, 8):
|
12
10
|
from typing import Literal
|
@@ -18,7 +16,11 @@ FileSubtypes = Literal["warning", "error"]
|
|
18
16
|
|
19
17
|
class RunQueueItemFileSaver:
|
20
18
|
def __init__(
|
21
|
-
self,
|
19
|
+
self,
|
20
|
+
agent_run: Optional[
|
21
|
+
Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
|
22
|
+
],
|
23
|
+
run_queue_item_id: str,
|
22
24
|
):
|
23
25
|
self.run_queue_item_id = run_queue_item_id
|
24
26
|
self.run = agent_run
|
@@ -26,7 +28,7 @@ class RunQueueItemFileSaver:
|
|
26
28
|
def save_contents(
|
27
29
|
self, contents: str, fname: str, file_sub_type: FileSubtypes
|
28
30
|
) -> Optional[List[str]]:
|
29
|
-
if not isinstance(self.run, Run):
|
31
|
+
if not isinstance(self.run, wandb.sdk.wandb_run.Run):
|
30
32
|
wandb.termwarn("Not saving file contents because agent has no run")
|
31
33
|
return None
|
32
34
|
root_dir = self.run._settings.files_dir
|
@@ -36,6 +36,7 @@ _logger = logging.getLogger(__name__)
|
|
36
36
|
|
37
37
|
|
38
38
|
_GENERATED_DOCKERFILE_NAME = "Dockerfile.wandb-autogenerated"
|
39
|
+
_DEFAULT_DOCKERFILE_NAME = "Dockerfile.wandb"
|
39
40
|
|
40
41
|
|
41
42
|
def validate_docker_installation() -> None:
|
@@ -237,7 +238,7 @@ def get_env_vars_dict(
|
|
237
238
|
if launch_project.sweep_id:
|
238
239
|
env_vars["WANDB_SWEEP_ID"] = launch_project.sweep_id
|
239
240
|
if launch_project.launch_spec.get("_resume_count", 0) > 0:
|
240
|
-
env_vars["WANDB_RESUME"] = "
|
241
|
+
env_vars["WANDB_RESUME"] = "allow"
|
241
242
|
|
242
243
|
_inject_wandb_config_env_vars(
|
243
244
|
launch_project.override_config, env_vars, max_env_length
|
@@ -321,7 +322,24 @@ def generate_dockerfile(
|
|
321
322
|
entry_point: EntryPoint,
|
322
323
|
runner_type: str,
|
323
324
|
builder_type: str,
|
325
|
+
dockerfile: Optional[str] = None,
|
324
326
|
) -> str:
|
327
|
+
override_entrypoint = launch_project.override_entrypoint or entry_point
|
328
|
+
if launch_project.project_dir is not None:
|
329
|
+
if not dockerfile and override_entrypoint.name is not None:
|
330
|
+
entrypoint_dir = os.path.dirname(override_entrypoint.name)
|
331
|
+
path = os.path.join(
|
332
|
+
launch_project.project_dir, entrypoint_dir, _DEFAULT_DOCKERFILE_NAME
|
333
|
+
)
|
334
|
+
if os.path.exists(path):
|
335
|
+
dockerfile = os.path.join(entrypoint_dir, _DEFAULT_DOCKERFILE_NAME)
|
336
|
+
if dockerfile:
|
337
|
+
path = os.path.join(launch_project.project_dir, dockerfile)
|
338
|
+
if not os.path.exists(path):
|
339
|
+
raise LaunchError(f"Dockerfile does not exist at {path}")
|
340
|
+
wandb.termlog(f"Using dockerfile: {dockerfile}")
|
341
|
+
return open(path).read()
|
342
|
+
|
325
343
|
# get python versions truncated to major.minor to ensure image availability
|
326
344
|
if launch_project.python_version:
|
327
345
|
spl = launch_project.python_version.split(".")[:2]
|
@@ -121,7 +121,11 @@ class DockerBuilder(AbstractBuilder):
|
|
121
121
|
entrypoint (EntryPoint): The entrypoint to use.
|
122
122
|
"""
|
123
123
|
dockerfile_str = generate_dockerfile(
|
124
|
-
launch_project
|
124
|
+
launch_project=launch_project,
|
125
|
+
entry_point=entrypoint,
|
126
|
+
runner_type=launch_project.resource,
|
127
|
+
builder_type="docker",
|
128
|
+
dockerfile=launch_project.override_dockerfile,
|
125
129
|
)
|
126
130
|
|
127
131
|
image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
|
@@ -241,7 +241,11 @@ class KanikoBuilder(AbstractBuilder):
|
|
241
241
|
raise LaunchError("No registry specified for Kaniko build.")
|
242
242
|
# kaniko builder doesn't seem to work with a custom user id, need more investigation
|
243
243
|
dockerfile_str = generate_dockerfile(
|
244
|
-
launch_project
|
244
|
+
launch_project=launch_project,
|
245
|
+
entry_point=entrypoint,
|
246
|
+
runner_type=launch_project.resource,
|
247
|
+
builder_type="kaniko",
|
248
|
+
dockerfile=launch_project.override_dockerfile,
|
245
249
|
)
|
246
250
|
image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
|
247
251
|
repo_uri = self.registry.get_repo_uri()
|
wandb/sdk/launch/create_job.py
CHANGED
@@ -63,7 +63,7 @@ def create_job(
|
|
63
63
|
runtime="3.9",
|
64
64
|
entrypoint="train.py",
|
65
65
|
)
|
66
|
-
# then
|
66
|
+
# then run the newly created job
|
67
67
|
artifact_job.call()
|
68
68
|
```
|
69
69
|
"""
|
@@ -180,7 +180,6 @@ def _create_job(
|
|
180
180
|
run_name=run.id, # run will be deleted after creation
|
181
181
|
description=description,
|
182
182
|
metadata=metadata,
|
183
|
-
labels=["manually-created"],
|
184
183
|
is_user_created=True,
|
185
184
|
aliases=[{"artifactCollectionName": name, "alias": a} for a in aliases],
|
186
185
|
)
|
@@ -335,19 +334,33 @@ def _create_repo_metadata(
|
|
335
334
|
entrypoint = rel_entrypoint
|
336
335
|
|
337
336
|
# check if requirements.txt exists
|
338
|
-
|
339
|
-
|
337
|
+
# start at the location of the python file and recurse up to the git root
|
338
|
+
req_dir = local_dir
|
339
|
+
while (
|
340
|
+
not os.path.exists(os.path.join(req_dir, "requirements.txt"))
|
341
|
+
and req_dir != tempdir
|
342
|
+
):
|
343
|
+
req_dir = os.path.dirname(req_dir)
|
344
|
+
|
345
|
+
if not os.path.exists(os.path.join(req_dir, "requirements.txt")):
|
340
346
|
wandb.termerror(
|
341
|
-
|
347
|
+
"Could not find requirements.txt file in git repo at "
|
348
|
+
f"{os.path.join(os.path.dirname(path), 'requirements.txt')} "
|
349
|
+
"or parent directories."
|
342
350
|
)
|
343
351
|
return None
|
344
352
|
|
353
|
+
wandb.termlog(
|
354
|
+
f"Using requirements.txt in {req_dir.replace(tempdir, '') or 'repository root'}"
|
355
|
+
)
|
356
|
+
|
345
357
|
metadata = {
|
346
358
|
"git": {
|
347
359
|
"commit": commit,
|
348
360
|
"remote": ref.url,
|
349
361
|
},
|
350
362
|
"root": ref.repo,
|
363
|
+
"codePathLocal": entrypoint, # not in git context, optionally also set local
|
351
364
|
"codePath": entrypoint,
|
352
365
|
"entrypoint": [f"python{python_version}", entrypoint],
|
353
366
|
"python": python_version, # used to build container
|
@@ -426,6 +439,8 @@ def _configure_job_builder_for_partial(tmpdir: str, job_source: str) -> JobBuild
|
|
426
439
|
job_builder = JobBuilder(
|
427
440
|
settings=settings,
|
428
441
|
)
|
442
|
+
# never allow notebook runs
|
443
|
+
job_builder._is_notebook_run = False
|
429
444
|
# set run inputs and outputs to empty dicts
|
430
445
|
job_builder.set_config({})
|
431
446
|
job_builder.set_summary({})
|
wandb/sdk/launch/loader.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Any, Dict, Optional
|
|
3
3
|
|
4
4
|
import wandb
|
5
5
|
from wandb.apis.internal import Api
|
6
|
+
from wandb.docker import is_docker_installed
|
6
7
|
from wandb.sdk.launch.errors import LaunchError
|
7
8
|
|
8
9
|
from .builder.abstract import AbstractBuilder
|
@@ -141,7 +142,10 @@ def builder_from_config(
|
|
141
142
|
This helper function is used to create a builder from a config. The
|
142
143
|
config should have a "type" key that specifies the type of builder to import
|
143
144
|
and create. The remaining keys are passed to the builder's from_config
|
144
|
-
method. If the config is None or empty, a
|
145
|
+
method. If the config is None or empty, a default builder is returned.
|
146
|
+
|
147
|
+
The default builder will be a DockerBuilder if we find a working docker cli
|
148
|
+
on the system, otherwise it will be a NoOpBuilder.
|
145
149
|
|
146
150
|
Arguments:
|
147
151
|
config (Dict[str, Any]): The builder config.
|
@@ -154,11 +158,16 @@ def builder_from_config(
|
|
154
158
|
LaunchError: If the builder is not configured correctly.
|
155
159
|
"""
|
156
160
|
if not config:
|
157
|
-
|
161
|
+
if is_docker_installed():
|
162
|
+
from .builder.docker_builder import DockerBuilder
|
163
|
+
|
164
|
+
return DockerBuilder.from_config(
|
165
|
+
{}, environment, registry
|
166
|
+
) # This is the default builder.
|
167
|
+
|
168
|
+
from .builder.noop import NoOpBuilder
|
158
169
|
|
159
|
-
return
|
160
|
-
{}, environment, registry
|
161
|
-
) # This is the default builder.
|
170
|
+
return NoOpBuilder.from_config({}, environment, registry)
|
162
171
|
|
163
172
|
builder_type = config.get("type")
|
164
173
|
if builder_type is None:
|
@@ -13,7 +13,6 @@ from typing import Any, Dict, List, Optional, Union
|
|
13
13
|
from dockerpycreds.utils import find_executable # type: ignore
|
14
14
|
|
15
15
|
import wandb
|
16
|
-
from wandb import Settings
|
17
16
|
from wandb.apis.internal import Api
|
18
17
|
from wandb.sdk.lib import runid
|
19
18
|
|
@@ -136,7 +135,6 @@ class AbstractRunner(ABC):
|
|
136
135
|
api: Api,
|
137
136
|
backend_config: Dict[str, Any],
|
138
137
|
) -> None:
|
139
|
-
self._settings = Settings()
|
140
138
|
self._api = api
|
141
139
|
self.backend_config = backend_config
|
142
140
|
self._cwd = os.getcwd()
|