wandb 0.16.4__py3-none-any.whl → 0.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/public/api.py +6 -6
- wandb/apis/reports/v2/interface.py +4 -8
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +29 -5
- wandb/integration/openai/fine_tuning.py +74 -37
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +92 -26
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
- wandb/sdk/artifacts/artifact_saver.py +16 -36
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +13 -5
- wandb/sdk/interface/interface.py +60 -15
- wandb/sdk/interface/interface_shared.py +13 -7
- wandb/sdk/internal/file_stream.py +19 -0
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +2 -0
- wandb/sdk/internal/job_builder.py +45 -17
- wandb/sdk/internal/sender.py +53 -28
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +4 -1
- wandb/sdk/launch/_launch.py +5 -0
- wandb/sdk/launch/_project_spec.py +5 -20
- wandb/sdk/launch/agent/agent.py +80 -37
- wandb/sdk/launch/agent/config.py +8 -0
- wandb/sdk/launch/builder/kaniko_builder.py +149 -134
- wandb/sdk/launch/create_job.py +44 -48
- wandb/sdk/launch/runner/kubernetes_monitor.py +3 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/sweeps/scheduler.py +3 -1
- wandb/sdk/launch/utils.py +23 -5
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +2 -0
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +78 -0
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/wandb_init.py +12 -7
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +179 -94
- wandb/sdk/wandb_settings.py +55 -16
- wandb/testing/relay.py +5 -6
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/METADATA +1 -1
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/RECORD +55 -54
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/WHEEL +1 -1
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/LICENSE +0 -0
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/top_level.txt +0 -0
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -45,7 +45,10 @@ MAX_RESUME_COUNT = 5
 
 RUN_INFO_GRACE_PERIOD = 60
 
-
+DEFAULT_STOPPED_RUN_TIMEOUT = 60
+
+DEFAULT_PRINT_INTERVAL = 5 * 60
+VERBOSE_PRINT_INTERVAL = 20
 
 _env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
 if _env_timeout:
@@ -105,30 +108,29 @@ def _max_from_config
     return max_from_config
 
 
-
-
-
-        _logger.debug("Recieved runSpec in _is_scheduler_job that was empty")
+class InternalAgentLogger:
+    def __init__(self, verbosity=0):
+        self._print_to_terminal = verbosity >= 2
 
-
-
+    def error(self, message: str):
+        if self._print_to_terminal:
+            wandb.termerror(f"{LOG_PREFIX}{message}")
+        _logger.error(f"{LOG_PREFIX}{message}")
 
-
-
-
-
-        return True
+    def warn(self, message: str):
+        if self._print_to_terminal:
+            wandb.termwarn(f"{LOG_PREFIX}{message}")
+        _logger.warn(f"{LOG_PREFIX}{message}")
 
-
-
-
-
-        return False
+    def info(self, message: str):
+        if self._print_to_terminal:
+            wandb.termlog(f"{LOG_PREFIX}{message}")
+        _logger.info(f"{LOG_PREFIX}{message}")
 
-
-
-
-
+    def debug(self, message: str):
+        if self._print_to_terminal:
+            wandb.termlog(f"{LOG_PREFIX}{message}")
+        _logger.debug(f"{LOG_PREFIX}{message}")
 
 
 class LaunchAgent:
@@ -184,7 +186,13 @@ class LaunchAgent:
         self._max_jobs = _max_from_config(config, "max_jobs")
         self._max_schedulers = _max_from_config(config, "max_schedulers")
         self._secure_mode = config.get("secure_mode", False)
+        self._verbosity = config.get("verbosity", 0)
+        self._internal_logger = InternalAgentLogger(verbosity=self._verbosity)
+        self._last_status_print_time = 0.0
         self.default_config: Dict[str, Any] = config
+        self._stopped_run_timeout = config.get(
+            "stopped_run_timeout", DEFAULT_STOPPED_RUN_TIMEOUT
+        )
 
         # Get agent version from env var if present, otherwise wandb version
         self.version: str = "wandb@" + wandb.__version__
@@ -228,6 +236,33 @@ class LaunchAgent:
         self._name = agent_response["name"]
         self._init_agent_run()
 
+    def _is_scheduler_job(self, run_spec: Dict[str, Any]) -> bool:
+        """Determine whether a job/runSpec is a sweep scheduler."""
+        if not run_spec:
+            self._internal_logger.debug(
+                "Recieved runSpec in _is_scheduler_job that was empty"
+            )
+
+        if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
+            return False
+
+        if run_spec.get("resource") == "local-process":
+            # Any job pushed to a run queue that has a scheduler uri is
+            # allowed to use local-process
+            if run_spec.get("job"):
+                return True
+
+            # If a scheduler is local-process and run through CLI, also
+            # confirm command is in format: [wandb scheduler <sweep>]
+            cmd = run_spec.get("overrides", {}).get("entry_point", [])
+            if len(cmd) < 3:
+                return False
+
+            if cmd[:2] != ["wandb", "scheduler"]:
+                return False
+
+            return True
+
     async def fail_run_queue_item(
         self,
         run_queue_item_id: str,
@@ -298,6 +333,7 @@ class LaunchAgent:
 
     def print_status(self) -> None:
         """Prints the current status of the agent."""
+        self._last_status_print_time = time.time()
         output_str = "agent "
         if self._name:
             output_str += f"{self._name} "
@@ -344,8 +380,8 @@ class LaunchAgent:
             if run_state.lower() != "pending":
                 return True
         except CommError:
-
-                f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run"
+            self._internal_logger.info(
+                f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run",
             )
         return False
 
@@ -361,8 +397,8 @@ class LaunchAgent:
             job_and_run_status.entity is not None
            and job_and_run_status.entity != self._entity
         ):
-
-                "Skipping check for completed run status because run is on a different entity than agent"
+            self._internal_logger.info(
+                "Skipping check for completed run status because run is on a different entity than agent",
             )
         elif exception is not None:
             tb_str = traceback.format_exception(
@@ -378,8 +414,8 @@ class LaunchAgent:
                 fnames,
             )
         elif job_and_run_status.project is None or job_and_run_status.run_id is None:
-
-                f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}"
+            self._internal_logger.info(
+                f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}",
             )
             wandb.termerror(
                 "Missing project or run id on thread called finish thread id"
@@ -430,7 +466,9 @@ class LaunchAgent:
                     job_and_run_status.run_queue_item_id, _msg, "run", fnames
                 )
         else:
-
+            self._internal_logger.info(
+                f"Finish thread id {thread_id} had no exception and no run"
+            )
             wandb._sentry.exception(
                 "launch agent called finish thread id on thread without run or exception"
             )
@@ -458,7 +496,7 @@ class LaunchAgent:
         await self.update_status(AGENT_RUNNING)
 
         # parse job
-
+        self._internal_logger.info("Parsing launch spec")
         launch_spec = job["runSpec"]
 
         # Abort if this job attempts to override secure mode
@@ -511,6 +549,10 @@ class LaunchAgent:
             KeyboardInterrupt: if the agent is requested to stop.
         """
         self.print_status()
+        if self._verbosity == 0:
+            print_interval = DEFAULT_PRINT_INTERVAL
+        else:
+            print_interval = VERBOSE_PRINT_INTERVAL
         try:
             while True:
                 job = None
@@ -532,7 +574,7 @@ class LaunchAgent:
                     file_saver = RunQueueItemFileSaver(
                         self._wandb_run, job["runQueueItemId"]
                     )
-                    if _is_scheduler_job(job.get("runSpec", {})):
+                    if self._is_scheduler_job(job.get("runSpec", {})):
                         # If job is a scheduler, and we are already at the cap, ignore,
                         # don't ack, and it will be pushed back onto the queue in 1 min
                         if self.num_running_schedulers >= self._max_schedulers:
@@ -567,6 +609,7 @@ class LaunchAgent:
                     await self.update_status(AGENT_POLLING)
                 else:
                     await self.update_status(AGENT_RUNNING)
+                if time.time() - self._last_status_print_time > print_interval:
                     self.print_status()
 
                 if self.num_running_jobs == self._max_jobs or job is None:
@@ -634,14 +677,14 @@ class LaunchAgent:
             await self.check_sweep_state(launch_spec, api)
 
         job_tracker.update_run_info(project)
-
+        self._internal_logger.info("Fetching and validating project...")
         project.fetch_and_validate_project()
-
+        self._internal_logger.info("Fetching resource...")
         resource = launch_spec.get("resource") or "local-container"
         backend_config: Dict[str, Any] = {
             PROJECT_SYNCHRONOUS: False, # agent always runs async
         }
-
+        self._internal_logger.info("Loading backend")
        override_build_config = launch_spec.get("builder")
 
        _, build_config, registry_config = construct_agent_configs(
@@ -661,13 +704,13 @@ class LaunchAgent:
             assert entrypoint is not None
             image_uri = await builder.build_image(project, entrypoint, job_tracker)
 
-
+        self._internal_logger.info("Backend loaded...")
         if isinstance(backend, LocalProcessRunner):
             run = await backend.run(project, image_uri)
         else:
             assert image_uri
             run = await backend.run(project, image_uri)
-        if _is_scheduler_job(launch_spec):
+        if self._is_scheduler_job(launch_spec):
             with self._jobs_lock:
                 self._jobs[thread_id].is_scheduler = True
             wandb.termlog(
@@ -700,7 +743,7 @@ class LaunchAgent:
                 if stopped_time is None:
                     stopped_time = time.time()
                 else:
-                    if time.time() - stopped_time >
+                    if time.time() - stopped_time > self._stopped_run_timeout:
                         await run.cancel()
                 await asyncio.sleep(AGENT_POLLING_INTERVAL)
 
@@ -720,7 +763,7 @@ class LaunchAgent:
                 project=launch_spec["project"],
             )
         except Exception as e:
-
+            self._internal_logger.debug(f"Fetch sweep state error: {e}")
             state = None
 
         if state != "RUNNING" and state != "PAUSED":
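The new InternalAgentLogger routes agent-internal messages to the Python logger and, when verbosity is 2 or higher, mirrors them to the terminal; verbosity also switches the periodic status print from every five minutes to every twenty seconds. Below is a minimal standalone sketch of that behavior, using the standard logging module in place of wandb's term* helpers; the class and variable names are illustrative, not wandb's.

import logging
import time

logging.basicConfig(level=logging.DEBUG)
_logger = logging.getLogger("launch-agent-sketch")

DEFAULT_PRINT_INTERVAL = 5 * 60  # seconds, used when verbosity == 0
VERBOSE_PRINT_INTERVAL = 20      # seconds, used when verbosity >= 1


class AgentLoggerSketch:
    """Mirrors InternalAgentLogger: always log, echo to the terminal only when very verbose."""

    def __init__(self, verbosity: int = 0) -> None:
        self._print_to_terminal = verbosity >= 2

    def info(self, message: str) -> None:
        if self._print_to_terminal:
            print(f"launch: {message}")  # stands in for wandb.termlog
        _logger.info("launch: %s", message)


verbosity = 1  # hypothetical agent setting
logger = AgentLoggerSketch(verbosity=verbosity)
print_interval = DEFAULT_PRINT_INTERVAL if verbosity == 0 else VERBOSE_PRINT_INTERVAL
last_status_print = 0.0

# Same check the polling loop performs before re-printing agent status.
if time.time() - last_status_print > print_interval:
    last_status_print = time.time()
    logger.info("agent polling on queues ...")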
wandb/sdk/launch/agent/config.py
CHANGED
@@ -225,6 +225,14 @@ class AgentConfig(BaseModel):
         None,
         description="The builder to use.",
     )
+    verbosity: Optional[int] = Field(
+        0,
+        description="How verbose to print, 0 = default, 1 = verbose, 2 = very verbose",
+    )
+    stopped_run_timeout: Optional[int] = Field(
+        60,
+        description="How many seconds to wait after receiving the stop command before forcibly cancelling a run.",
+    )
 
     class Config:
         extra = "forbid"
wandb/sdk/launch/builder/kaniko_builder.py
CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import base64
+import copy
 import json
 import logging
 import os
@@ -8,7 +9,7 @@ import tarfile
 import tempfile
 import time
 import traceback
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import wandb
 from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
@@ -105,6 +106,7 @@ class KanikoBuilder(AbstractBuilder):
         secret_name: str = "",
         secret_key: str = "",
         image: str = "gcr.io/kaniko-project/executor:v1.11.0",
+        config: Optional[dict] = None,
     ):
         """Initialize a KanikoBuilder.
 
@@ -125,6 +127,7 @@ class KanikoBuilder(AbstractBuilder):
         self.secret_name = secret_name
         self.secret_key = secret_key
         self.image = image
+        self.kaniko_config = config or {}
 
     @classmethod
     def from_config(
@@ -170,6 +173,7 @@ class KanikoBuilder(AbstractBuilder):
         image_uri = config.get("destination")
         if image_uri is not None:
             registry = registry_from_uri(image_uri)
+        kaniko_config = config.get("kaniko-config", {})
 
         return cls(
             environment,
@@ -179,6 +183,7 @@ class KanikoBuilder(AbstractBuilder):
             secret_name=secret_name,
             secret_key=secret_key,
             image=kaniko_image,
+            config=kaniko_config,
         )
 
     async def verify(self) -> None:
@@ -289,7 +294,7 @@ class KanikoBuilder(AbstractBuilder):
 
         build_context = await self._upload_build_context(run_id, context_path)
         build_job = await self._create_kaniko_job(
-            build_job_name, repo_uri, image_uri, build_context, core_v1
+            build_job_name, repo_uri, image_uri, build_context, core_v1, api_client
         )
         wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
 
@@ -324,7 +329,9 @@ class KanikoBuilder(AbstractBuilder):
             ):
                 if job_tracker:
                     job_tracker.set_err_stage("build")
-                raise Exception(
+                raise Exception(
+                    f"Failed to build image in kaniko for job {run_id}. View logs with `kubectl logs -n {NAMESPACE} {build_job_name}`."
+                )
             try:
                 pods_from_job = await core_v1.list_namespaced_pod(
                     namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
@@ -371,23 +378,32 @@ class KanikoBuilder(AbstractBuilder):
         image_tag: str,
         build_context_path: str,
         core_client: client.CoreV1Api,
-
-
-
-
+        api_client,
+    ) -> Dict[str, Any]:
+        job = copy.deepcopy(self.kaniko_config)
+        job_metadata = job.get("metadata", {})
+        job_labels = job_metadata.get("labels", {})
+        job_spec = job.get("spec", {})
+        pod_template = job_spec.get("template", {})
+        pod_metadata = pod_template.get("metadata", {})
+        pod_labels = pod_metadata.get("labels", {})
+        pod_spec = pod_template.get("spec", {})
+        volumes = pod_spec.get("volumes", [])
+        containers = pod_spec.get("containers") or [{}]
+        if len(containers) > 1:
+            raise LaunchError(
+                "Multiple container configs not supported for kaniko builder."
+            )
+        container = containers[0]
+        volume_mounts = container.get("volumeMounts", [])
+        env = container.get("env", [])
+        custom_args = container.get("args", [])
 
         if PVC_MOUNT_PATH:
             volumes.append(
-
-                    name="kaniko-pvc",
-                    persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                        claim_name=PVC_NAME
-                    ),
-                )
-            )
-            volume_mounts.append(
-                client.V1VolumeMount(name="kaniko-pvc", mount_path="/context")
+                {"name": "kaniko-pvc", "persistentVolumeClaim": {"claimName": PVC_NAME}}
             )
+            volume_mounts.append({"name": "kaniko-pvc", "mountPath": "/context"})
 
         if bool(self.secret_name) != bool(self.secret_key):
             raise LaunchError(
@@ -395,13 +411,13 @@ class KanikoBuilder(AbstractBuilder):
                 "for kaniko build. You provided only one of them."
             )
         if isinstance(self.registry, ElasticContainerRegistry):
-            env
-
-                name
-                value
-
-
-        # TODO: Refactor all of this environment/registry
+            env.append(
+                {
+                    "name": "AWS_REGION",
+                    "value": self.registry.region,
+                }
+            )
+        # TODO(ben): Refactor all of this environment/registry
         # specific stuff into methods of those classes.
         if isinstance(self.environment, AzureEnvironment):
             # Use the core api to check if the secret exists
@@ -416,52 +432,46 @@ class KanikoBuilder(AbstractBuilder):
                     "namespace wandb. Please create it with the key password "
                     "set to your azure storage access key."
                 ) from e
-            env
-
-                name
-
-
-                name
-                key
-
-
-
-
+            env.append(
+                {
+                    "name": "AZURE_STORAGE_ACCESS_KEY",
+                    "valueFrom": {
+                        "secretKeyRef": {
+                            "name": "azure-storage-access-key",
+                            "key": "password",
+                        }
+                    },
+                }
+            )
         if DOCKER_CONFIG_SECRET:
             volumes.append(
-
-                    name
-                    secret
-
-                        items
-
-                            key
-
+                {
+                    "name": "kaniko-docker-config",
+                    "secret": {
+                        "secretName": DOCKER_CONFIG_SECRET,
+                        "items": [
+                            {
+                                "key": ".dockerconfigjson",
+                                "path": "config.json",
+                            }
                         ],
-
-
+                    },
+                }
             )
             volume_mounts.append(
-
-                name="kaniko-docker-config",
-                mount_path="/kaniko/.docker",
-            )
+                {"name": "kaniko-docker-config", "mountPath": "/kaniko/.docker"}
             )
         elif self.secret_name and self.secret_key:
-            volumes
-
-                name
-
-
-
-
-
-
-
-                    name="docker-config", mount_path="/kaniko/.docker/"
-                ),
-            ]
-        # TODO: I don't like conditioning on the registry type here. As a
+            volumes.append(
+                {
+                    "name": "docker-config",
+                    "configMap": {"name": f"docker-config-{job_name}"},
+                }
+            )
+            volume_mounts.append(
+                {"name": "docker-config", "mountPath": "/kaniko/.docker"}
+            )
+        # TODO(ben): I don't like conditioning on the registry type here. As a
         # future change I want the registry and environment classes to provide
         # a list of environment variables and volume mounts that need to be
         # added to the job. The environment class provides credentials for
@@ -475,90 +485,95 @@ class KanikoBuilder(AbstractBuilder):
         elif isinstance(self.registry, GoogleArtifactRegistry):
             mount_path = "/kaniko/.config/gcloud"
             key = "config.json"
-            env
-
-                name
-                value
-
-
+            env.append(
+                {
+                    "name": "GOOGLE_APPLICATION_CREDENTIALS",
+                    "value": "/kaniko/.config/gcloud/config.json",
+                }
+            )
         else:
             raise LaunchError(
                 f"Registry type {type(self.registry)} not supported by kaniko"
             )
-
-
-            name
-
-
-
-
-
-
-
-
-
-
-
-
-
+        volumes.append(
+            {
+                "name": self.secret_name,
+                "secret": {
+                    "secretName": self.secret_name,
+                    "items": [{"key": self.secret_key, "path": key}],
+                },
+            }
+        )
+        volume_mounts.append(
+            {
+                "name": self.secret_name,
+                "mountPath": mount_path,
+                "readOnly": True,
+            }
+        )
         if isinstance(self.registry, AzureContainerRegistry):
-            #
-
-
-                name
-
-
-
-
-
-
-                    name=f"docker-config-{job_name}",
-                ),
-            ),
-        ]
+            # Add the docker config map
+            volumes.append(
+                {
+                    "name": "docker-config",
+                    "configMap": {"name": f"docker-config-{job_name}"},
+                }
+            )
+            volume_mounts.append(
+                {"name": "docker-config", "mountPath": "/kaniko/.docker/"}
+            )
         # Kaniko doesn't want https:// at the begining of the image tag.
         destination = image_tag
         if destination.startswith("https://"):
             destination = destination.replace("https://", "")
-        args =
-
-
-
-            "--cache
-
-            "--
-            "--compressed-caching
+        args = {
+            "--context": build_context_path,
+            "--dockerfile": _WANDB_DOCKERFILE_NAME,
+            "--destination": destination,
+            "--cache": "true",
+            "--cache-repo": repository.replace("https://", ""),
+            "--snapshot-mode": "redo",
+            "--compressed-caching": "false",
+        }
+        for custom_arg in custom_args:
+            arg_name, arg_value = custom_arg.split("=", 1)
+            args[arg_name] = arg_value
+        parsed_args = [
+            f"{arg_name}={arg_value}" for arg_name, arg_value in args.items()
        ]
-        container =
-
-
-
-            volume_mounts=volume_mounts,
-            env=env if env else None,
-        )
-        # Create and configure a spec section
-        labels = {"wandb": "launch"}
+        container["args"] = parsed_args
+
+        # Apply the rest of our defaults
+        pod_labels["wandb"] = "launch"
         # This annotation is required to enable azure workload identity.
         if isinstance(self.registry, AzureContainerRegistry):
-
-
-
-
-            restart_policy="Never",
-            active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
-            containers=[container],
-            volumes=volumes,
-            service_account_name=SERVICE_ACCOUNT_NAME,
-        ),
+            pod_labels["azure.workload.identity/use"] = "true"
+        pod_spec["restartPolicy"] = pod_spec.get("restartPolicy", "Never")
+        pod_spec["activeDeadlineSeconds"] = pod_spec.get(
+            "activeDeadlineSeconds", _DEFAULT_BUILD_TIMEOUT_SECS
         )
-
-
-        job = client.V1Job(
-            api_version="batch/v1",
-            kind="Job",
-            metadata=client.V1ObjectMeta(
-                name=job_name, namespace=NAMESPACE, labels={"wandb": "launch"}
-            ),
-            spec=spec,
+        pod_spec["serviceAccountName"] = pod_spec.get(
+            "serviceAccountName", SERVICE_ACCOUNT_NAME
         )
+        job_spec["backoffLimit"] = job_spec.get("backoffLimit", 0)
+        job_labels["wandb"] = "launch"
+        job_metadata["namespace"] = job_metadata.get("namespace", NAMESPACE)
+        job_metadata["name"] = job_metadata.get("name", job_name)
+        job["apiVersion"] = "batch/v1"
+        job["kind"] = "Job"
+
+        # Apply all nested configs from the bottom up
+        pod_metadata["labels"] = pod_labels
+        pod_template["metadata"] = pod_metadata
+        container["name"] = container.get("name", "wandb-container-build")
+        container["image"] = container.get("image", self.image)
+        container["volumeMounts"] = volume_mounts
+        container["env"] = env
+        pod_spec["containers"] = [container]
+        pod_spec["volumes"] = volumes
+        pod_template["spec"] = pod_spec
+        job_spec["template"] = pod_template
+        job_metadata["labels"] = job_labels
+        job["metadata"] = job_metadata
+        job["spec"] = job_spec
        return job