wandb 0.16.5__py3-none-any.whl → 0.16.6__py3-none-any.whl
- wandb/__init__.py +1 -1
- wandb/cli/cli.py +5 -2
- wandb/integration/openai/fine_tuning.py +74 -37
- wandb/proto/v3/wandb_internal_pb2.py +192 -192
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_internal_pb2.py +192 -192
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/sdk/artifacts/artifact.py +25 -9
- wandb/sdk/artifacts/artifact_saver.py +16 -19
- wandb/sdk/interface/interface.py +18 -6
- wandb/sdk/launch/_launch.py +5 -0
- wandb/sdk/launch/_project_spec.py +5 -20
- wandb/sdk/launch/agent/agent.py +80 -37
- wandb/sdk/launch/agent/config.py +8 -0
- wandb/sdk/launch/builder/kaniko_builder.py +149 -134
- wandb/sdk/launch/create_job.py +43 -48
- wandb/sdk/launch/runner/kubernetes_monitor.py +3 -1
- wandb/sdk/launch/sweeps/scheduler.py +3 -1
- wandb/sdk/launch/utils.py +18 -0
- wandb/sdk/lib/_settings_toposort_generated.py +1 -0
- wandb/sdk/lib/run_moment.py +7 -1
- wandb/sdk/wandb_init.py +2 -8
- wandb/sdk/wandb_run.py +50 -34
- wandb/sdk/wandb_settings.py +2 -0
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/METADATA +1 -1
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/RECORD +30 -30
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/LICENSE +0 -0
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/WHEEL +0 -0
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/top_level.txt +0 -0
wandb/sdk/launch/builder/kaniko_builder.py
CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import base64
+import copy
 import json
 import logging
 import os
@@ -8,7 +9,7 @@ import tarfile
 import tempfile
 import time
 import traceback
-from typing import Optional
+from typing import Any, Dict, Optional

 import wandb
 from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
@@ -105,6 +106,7 @@ class KanikoBuilder(AbstractBuilder):
         secret_name: str = "",
         secret_key: str = "",
         image: str = "gcr.io/kaniko-project/executor:v1.11.0",
+        config: Optional[dict] = None,
     ):
         """Initialize a KanikoBuilder.

@@ -125,6 +127,7 @@ class KanikoBuilder(AbstractBuilder):
         self.secret_name = secret_name
         self.secret_key = secret_key
         self.image = image
+        self.kaniko_config = config or {}

     @classmethod
     def from_config(
@@ -170,6 +173,7 @@ class KanikoBuilder(AbstractBuilder):
         image_uri = config.get("destination")
         if image_uri is not None:
             registry = registry_from_uri(image_uri)
+        kaniko_config = config.get("kaniko-config", {})

         return cls(
             environment,
@@ -179,6 +183,7 @@ class KanikoBuilder(AbstractBuilder):
             secret_name=secret_name,
             secret_key=secret_key,
             image=kaniko_image,
+            config=kaniko_config,
         )

     async def verify(self) -> None:
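Note: the new `kaniko-config` key lets the agent's builder config carry a partial Kubernetes Job spec that `from_config` hands to the builder unchanged. A rough sketch of such a config as a Python dict follows; only `destination` and `kaniko-config` are keys read in these hunks, and the `type` key plus all values are illustrative assumptions.

# Hypothetical builder config after being loaded from the agent's config file.
builder_config = {
    "type": "kaniko",  # assumed; not shown in this diff
    "destination": "us-central1-docker.pkg.dev/my-project/my-repo",
    "kaniko-config": {
        "metadata": {"labels": {"team": "ml-infra"}},
        "spec": {
            "template": {
                "spec": {
                    "containers": [{"args": ["--cache-ttl=48h"]}],
                },
            },
        },
    },
}

kaniko_config = builder_config.get("kaniko-config", {})  # mirrors from_config above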
@@ -289,7 +294,7 @@

         build_context = await self._upload_build_context(run_id, context_path)
         build_job = await self._create_kaniko_job(
-            build_job_name, repo_uri, image_uri, build_context, core_v1
+            build_job_name, repo_uri, image_uri, build_context, core_v1, api_client
         )
         wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")

@@ -324,7 +329,9 @@
         ):
             if job_tracker:
                 job_tracker.set_err_stage("build")
-            raise Exception(
+            raise Exception(
+                f"Failed to build image in kaniko for job {run_id}. View logs with `kubectl logs -n {NAMESPACE} {build_job_name}`."
+            )
         try:
             pods_from_job = await core_v1.list_namespaced_pod(
                 namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
@@ -371,23 +378,32 @@ class KanikoBuilder(AbstractBuilder):
         image_tag: str,
         build_context_path: str,
         core_client: client.CoreV1Api,
-
-
-
-
+        api_client,
+    ) -> Dict[str, Any]:
+        job = copy.deepcopy(self.kaniko_config)
+        job_metadata = job.get("metadata", {})
+        job_labels = job_metadata.get("labels", {})
+        job_spec = job.get("spec", {})
+        pod_template = job_spec.get("template", {})
+        pod_metadata = pod_template.get("metadata", {})
+        pod_labels = pod_metadata.get("labels", {})
+        pod_spec = pod_template.get("spec", {})
+        volumes = pod_spec.get("volumes", [])
+        containers = pod_spec.get("containers") or [{}]
+        if len(containers) > 1:
+            raise LaunchError(
+                "Multiple container configs not supported for kaniko builder."
+            )
+        container = containers[0]
+        volume_mounts = container.get("volumeMounts", [])
+        env = container.get("env", [])
+        custom_args = container.get("args", [])

         if PVC_MOUNT_PATH:
             volumes.append(
-
-                name="kaniko-pvc",
-                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                    claim_name=PVC_NAME
-                ),
-            )
-            )
-            volume_mounts.append(
-                client.V1VolumeMount(name="kaniko-pvc", mount_path="/context")
+                {"name": "kaniko-pvc", "persistentVolumeClaim": {"claimName": PVC_NAME}}
             )
+            volume_mounts.append({"name": "kaniko-pvc", "mountPath": "/context"})

         if bool(self.secret_name) != bool(self.secret_key):
             raise LaunchError(
@@ -395,13 +411,13 @@ class KanikoBuilder(AbstractBuilder):
                 "for kaniko build. You provided only one of them."
             )
         if isinstance(self.registry, ElasticContainerRegistry):
-            env
-
-                    name
-                    value
-
-
-            # TODO: Refactor all of this environment/registry
+            env.append(
+                {
+                    "name": "AWS_REGION",
+                    "value": self.registry.region,
+                }
+            )
+            # TODO(ben): Refactor all of this environment/registry
             # specific stuff into methods of those classes.
         if isinstance(self.environment, AzureEnvironment):
             # Use the core api to check if the secret exists
@@ -416,52 +432,46 @@ class KanikoBuilder(AbstractBuilder):
                     "namespace wandb. Please create it with the key password "
                     "set to your azure storage access key."
                 ) from e
-            env
-
-                    name
-
-
-                    name
-                    key
-
-
-
-
+            env.append(
+                {
+                    "name": "AZURE_STORAGE_ACCESS_KEY",
+                    "valueFrom": {
+                        "secretKeyRef": {
+                            "name": "azure-storage-access-key",
+                            "key": "password",
+                        }
+                    },
+                }
+            )
         if DOCKER_CONFIG_SECRET:
             volumes.append(
-
-                    name
-                    secret
-
-                        items
-
-                                key
-
+                {
+                    "name": "kaniko-docker-config",
+                    "secret": {
+                        "secretName": DOCKER_CONFIG_SECRET,
+                        "items": [
+                            {
+                                "key": ".dockerconfigjson",
+                                "path": "config.json",
+                            }
                         ],
-
-
+                    },
+                }
             )
             volume_mounts.append(
-
-                name="kaniko-docker-config",
-                mount_path="/kaniko/.docker",
-            )
+                {"name": "kaniko-docker-config", "mountPath": "/kaniko/.docker"}
             )
         elif self.secret_name and self.secret_key:
-            volumes
-
-                    name
-
-
-
-
-
-
-
-                    name="docker-config", mount_path="/kaniko/.docker/"
-                ),
-            ]
-            # TODO: I don't like conditioning on the registry type here. As a
+            volumes.append(
+                {
+                    "name": "docker-config",
+                    "configMap": {"name": f"docker-config-{job_name}"},
+                }
+            )
+            volume_mounts.append(
+                {"name": "docker-config", "mountPath": "/kaniko/.docker"}
+            )
+            # TODO(ben): I don't like conditioning on the registry type here. As a
             # future change I want the registry and environment classes to provide
             # a list of environment variables and volume mounts that need to be
             # added to the job. The environment class provides credentials for
@@ -475,90 +485,95 @@ class KanikoBuilder(AbstractBuilder):
         elif isinstance(self.registry, GoogleArtifactRegistry):
             mount_path = "/kaniko/.config/gcloud"
             key = "config.json"
-            env
-
-                    name
-                    value
-
-
+            env.append(
+                {
+                    "name": "GOOGLE_APPLICATION_CREDENTIALS",
+                    "value": "/kaniko/.config/gcloud/config.json",
+                }
+            )
         else:
             raise LaunchError(
                 f"Registry type {type(self.registry)} not supported by kaniko"
             )
-
-
-                    name
-
-
-
-
-
-
-
-
-
-
-
-
-
+        volumes.append(
+            {
+                "name": self.secret_name,
+                "secret": {
+                    "secretName": self.secret_name,
+                    "items": [{"key": self.secret_key, "path": key}],
+                },
+            }
+        )
+        volume_mounts.append(
+            {
+                "name": self.secret_name,
+                "mountPath": mount_path,
+                "readOnly": True,
+            }
+        )
         if isinstance(self.registry, AzureContainerRegistry):
-            #
-
-
-                    name
-
-
-
-
-
-
-                    name=f"docker-config-{job_name}",
-                ),
-            ),
-            ]
+            # Add the docker config map
+            volumes.append(
+                {
+                    "name": "docker-config",
+                    "configMap": {"name": f"docker-config-{job_name}"},
+                }
+            )
+            volume_mounts.append(
+                {"name": "docker-config", "mountPath": "/kaniko/.docker/"}
+            )
         # Kaniko doesn't want https:// at the begining of the image tag.
         destination = image_tag
         if destination.startswith("https://"):
             destination = destination.replace("https://", "")
-        args =
-
-
-
-            "--cache
-
-            "--
-            "--compressed-caching
+        args = {
+            "--context": build_context_path,
+            "--dockerfile": _WANDB_DOCKERFILE_NAME,
+            "--destination": destination,
+            "--cache": "true",
+            "--cache-repo": repository.replace("https://", ""),
+            "--snapshot-mode": "redo",
+            "--compressed-caching": "false",
+        }
+        for custom_arg in custom_args:
+            arg_name, arg_value = custom_arg.split("=", 1)
+            args[arg_name] = arg_value
+        parsed_args = [
+            f"{arg_name}={arg_value}" for arg_name, arg_value in args.items()
         ]
-        container =
-
-
-
-            volume_mounts=volume_mounts,
-            env=env if env else None,
-        )
-        # Create and configure a spec section
-        labels = {"wandb": "launch"}
+        container["args"] = parsed_args
+
+        # Apply the rest of our defaults
+        pod_labels["wandb"] = "launch"
         # This annotation is required to enable azure workload identity.
         if isinstance(self.registry, AzureContainerRegistry):
-
-
-
-
-            restart_policy="Never",
-            active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
-            containers=[container],
-            volumes=volumes,
-            service_account_name=SERVICE_ACCOUNT_NAME,
-        ),
+            pod_labels["azure.workload.identity/use"] = "true"
+        pod_spec["restartPolicy"] = pod_spec.get("restartPolicy", "Never")
+        pod_spec["activeDeadlineSeconds"] = pod_spec.get(
+            "activeDeadlineSeconds", _DEFAULT_BUILD_TIMEOUT_SECS
         )
-
-
-        job = client.V1Job(
-            api_version="batch/v1",
-            kind="Job",
-            metadata=client.V1ObjectMeta(
-                name=job_name, namespace=NAMESPACE, labels={"wandb": "launch"}
-            ),
-            spec=spec,
+        pod_spec["serviceAccountName"] = pod_spec.get(
+            "serviceAccountName", SERVICE_ACCOUNT_NAME
        )
+        job_spec["backoffLimit"] = job_spec.get("backoffLimit", 0)
+        job_labels["wandb"] = "launch"
+        job_metadata["namespace"] = job_metadata.get("namespace", NAMESPACE)
+        job_metadata["name"] = job_metadata.get("name", job_name)
+        job["apiVersion"] = "batch/v1"
+        job["kind"] = "Job"
+
+        # Apply all nested configs from the bottom up
+        pod_metadata["labels"] = pod_labels
+        pod_template["metadata"] = pod_metadata
+        container["name"] = container.get("name", "wandb-container-build")
+        container["image"] = container.get("image", self.image)
+        container["volumeMounts"] = volume_mounts
+        container["env"] = env
+        pod_spec["containers"] = [container]
+        pod_spec["volumes"] = volumes
+        pod_template["spec"] = pod_spec
+        job_spec["template"] = pod_template
+        job_metadata["labels"] = job_labels
+        job["metadata"] = job_metadata
+        job["spec"] = job_spec
         return job
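To see how the new args merge behaves, here is a minimal standalone sketch of the loop added above; all values are made up, and in the real method the defaults come from the build context, Dockerfile name, destination, and cache repository.

# Defaults as built in _create_kaniko_job, with illustrative values.
args = {
    "--context": "s3://bucket/context.tgz",
    "--dockerfile": "Dockerfile.wandb",
    "--destination": "registry.example.com/launch:abc123",
    "--cache": "true",
}
# Custom args taken from the kaniko-config container's "args" list.
custom_args = ["--cache-ttl=48h", "--cache=false"]

for custom_arg in custom_args:
    arg_name, arg_value = custom_arg.split("=", 1)
    args[arg_name] = arg_value  # custom values override the defaults

parsed_args = [f"{name}={value}" for name, value in args.items()]
# "--cache" ends up as "false" and "--cache-ttl=48h" is appended.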
wandb/sdk/launch/create_job.py
CHANGED
@@ -11,7 +11,7 @@ from wandb.sdk.artifacts.artifact import Artifact
 from wandb.sdk.internal.job_builder import JobBuilder
 from wandb.sdk.launch.builder.build import get_current_python_version
 from wandb.sdk.launch.git_reference import GitReference
-from wandb.sdk.launch.utils import _is_git_uri
+from wandb.sdk.launch.utils import _is_git_uri, get_entrypoint_file
 from wandb.sdk.lib import filesystem
 from wandb.util import make_artifact_name_safe

@@ -145,6 +145,7 @@ def _create_job(

     job_builder = _configure_job_builder_for_partial(tempdir.name, job_source=job_type)
     if job_type == "code":
+        assert entrypoint is not None
         job_name = _make_code_artifact(
             api=api,
             job_builder=job_builder,
@@ -233,7 +234,6 @@ def _make_metadata_for_partial_job(
         return metadata, None

     if job_type == "code":
-        path, entrypoint = _handle_artifact_entrypoint(path, entrypoint)
         if not entrypoint:
             wandb.termerror(
                 "Artifact jobs must have an entrypoint, either included in the path or specified with -E"
@@ -304,15 +304,22 @@ def _create_repo_metadata(
         with open(os.path.join(local_dir, ".python-version")) as f:
             python_version = f.read().strip().splitlines()[0]
     else:
-
-        python_version = f"{major}.{minor}"
+        _, python_version = get_current_python_version()

     python_version = _clean_python_version(python_version)

     # check if entrypoint is valid
     assert entrypoint is not None
-
-
+    entrypoint_list = entrypoint.split(" ")
+    entrypoint_file = get_entrypoint_file(entrypoint_list)
+    if not entrypoint_file:
+        wandb.termerror(
+            f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
+        )
+        return None
+
+    if not os.path.exists(os.path.join(local_dir, entrypoint_file)):
+        wandb.termerror(f"Entrypoint file {entrypoint_file} not found in git repo")
         return None

     metadata = {
@@ -320,9 +327,9 @@ def _create_repo_metadata(
             "commit": commit,
             "remote": ref.url,
         },
-        "codePathLocal":
-        "codePath":
-        "entrypoint":
+        "codePathLocal": entrypoint_file,  # not in git context, optionally also set local
+        "codePath": entrypoint_file,
+        "entrypoint": entrypoint_list,
         "python": python_version,  # used to build container
         "notebook": False,  # partial jobs from notebooks not supported
     }
@@ -332,10 +339,17 @@ def _create_repo_metadata(

 def _create_artifact_metadata(
     path: str, entrypoint: str, runtime: Optional[str] = None
-) -> Tuple[Dict[str, Any], List[str]]:
+) -> Tuple[Optional[Dict[str, Any]], Optional[List[str]]]:
     if not os.path.isdir(path):
         wandb.termerror("Path must be a valid file or directory")
         return {}, []
+    entrypoint_list = entrypoint.split(" ")
+    entrypoint_file = get_entrypoint_file(entrypoint_list)
+    if not entrypoint_file:
+        wandb.termerror(
+            f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
+        )
+        return None, None

     # read local requirements.txt and dump to temp dir for builder
     requirements = []
@@ -347,41 +361,17 @@ def _create_artifact_metadata(
     if runtime:
         python_version = _clean_python_version(runtime)
     else:
-        python_version =
+        python_version, _ = get_current_python_version()
+        python_version = _clean_python_version(python_version)

-    metadata = {
+    metadata = {
+        "python": python_version,
+        "codePath": entrypoint_file,
+        "entrypoint": entrypoint_list,
+    }
     return metadata, requirements


-def _handle_artifact_entrypoint(
-    path: str, entrypoint: Optional[str] = None
-) -> Tuple[str, Optional[str]]:
-    if os.path.isfile(path):
-        if entrypoint and path.endswith(entrypoint):
-            path = path.replace(entrypoint, "")
-            wandb.termwarn(
-                f"Both entrypoint provided and path contains file. Using provided entrypoint: {entrypoint}, path is now: {path}"
-            )
-        elif entrypoint:
-            wandb.termwarn(
-                f"Ignoring passed in entrypoint as it does not match file path found in 'path'. Path entrypoint: {path.split('/')[-1]}"
-            )
-        entrypoint = path.split("/")[-1]
-        path = "/".join(path.split("/")[:-1])
-    elif not entrypoint:
-        wandb.termerror("Entrypoint not valid")
-        return "", None
-    path = path or "."  # when path is just an entrypoint, use cdw
-
-    if not os.path.exists(os.path.join(path, entrypoint)):
-        wandb.termerror(
-            f"Could not find execution point: {os.path.join(path, entrypoint)}"
-        )
-        return "", None
-
-    return path, entrypoint
-
-
 def _configure_job_builder_for_partial(tmpdir: str, job_source: str) -> JobBuilder:
     """Configure job builder with temp dir and job source."""
     # adjust git source to repo
@@ -411,7 +401,7 @@ def _make_code_artifact(
     job_builder: JobBuilder,
     run: "wandb.sdk.wandb_run.Run",
     path: str,
-    entrypoint:
+    entrypoint: str,
     entity: Optional[str],
     project: Optional[str],
     name: Optional[str],
@@ -420,17 +410,22 @@ def _make_code_artifact(

     Returns the name of the eventual job.
     """
-
+    assert entrypoint is not None
+    entrypoint_list = entrypoint.split(" ")
+    entrypoint_file = get_entrypoint_file(entrypoint_list)
+    if not entrypoint_file:
+        wandb.termerror(
+            f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
+        )
+        return None
+
+    artifact_name = _make_code_artifact_name(os.path.join(path, entrypoint_file), name)
     code_artifact = wandb.Artifact(
         name=artifact_name,
         type="code",
         description="Code artifact for job",
     )

-    # Update path and entrypoint vars to match metadata
-    # TODO(gst): consolidate into one place
-    path, entrypoint = _handle_artifact_entrypoint(path, entrypoint)
-
     try:
         code_artifact.add_dir(path)
     except Exception as e:
@@ -451,7 +446,7 @@ def _make_code_artifact(
         project_name=project,
         run_name=run.id,  # run will be deleted after creation
         description="Code artifact for job",
-        metadata={"codePath": path, "entrypoint":
+        metadata={"codePath": path, "entrypoint": entrypoint_file},
         is_user_created=True,
         aliases=[
             {"artifactCollectionName": artifact_name, "alias": a} for a in ["latest"]
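For orientation, the reworked code-job metadata now comes out roughly like this; the keys match the hunks above and the values are illustrative.

# Illustrative result of _create_artifact_metadata("./src", "python train.py"):
metadata = {
    "python": "3.10",                      # cleaned current interpreter version
    "codePath": "train.py",                # file resolved by get_entrypoint_file
    "entrypoint": ["python", "train.py"],  # the split entrypoint command
}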
wandb/sdk/launch/runner/kubernetes_monitor.py
CHANGED
@@ -433,6 +433,8 @@ class SafeWatch:
                 del kwargs["resource_version"]
                 self._last_seen_resource_version = None
             except Exception as E:
+                exc_type = type(E).__name__
+                stack_trace = traceback.format_exc()
                 wandb.termerror(
-                    f"Unknown exception in event stream: {E}, attempting to recover"
+                    f"Unknown exception in event stream of type {exc_type}: {E}, attempting to recover. Stack trace: {stack_trace}"
                 )
wandb/sdk/launch/sweeps/scheduler.py
CHANGED
@@ -157,7 +157,9 @@ class Scheduler(ABC):
         self._runs: Dict[str, SweepRun] = {}
         # Threading lock to ensure thread-safe access to the runs dictionary
         self._threading_lock: threading.Lock = threading.Lock()
-        self._polling_sleep =
+        self._polling_sleep = (
+            polling_sleep if polling_sleep is not None else DEFAULT_POLLING_SLEEP
+        )
         self._project_queue = project_queue
         # Optionally run multiple workers in (pseudo-)parallel. Workers do not
         # actually run training workloads, they simply send heartbeat messages
wandb/sdk/launch/utils.py
CHANGED
@@ -846,3 +846,21 @@ def fetch_and_validate_template_variables(
             raise LaunchError(f"Value for {key} must be of type {field_type}.")
         template_variables[key] = val
     return template_variables
+
+
+def get_entrypoint_file(entrypoint: List[str]) -> Optional[str]:
+    """Get the entrypoint file from the given command.
+
+    Args:
+        entrypoint (List[str]): List of command and arguments.
+
+    Returns:
+        Optional[str]: The entrypoint file if found, otherwise None.
+    """
+    if not entrypoint:
+        return None
+    if entrypoint[0].endswith(".py") or entrypoint[0].endswith(".sh"):
+        return entrypoint[0]
+    if len(entrypoint) < 2:
+        return None
+    return entrypoint[1]
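A quick sketch of how the new helper resolves entrypoint commands; the return values follow directly from the definition above.

from wandb.sdk.launch.utils import get_entrypoint_file

get_entrypoint_file(["python", "train.py"])    # -> "train.py"
get_entrypoint_file(["./run.sh", "--epochs"])  # -> "./run.sh" (first token ends in .sh)
get_entrypoint_file(["train.py"])              # -> "train.py"
get_entrypoint_file(["python"])                # -> None (no file to resolve)
get_entrypoint_file([])                        # -> None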
wandb/sdk/lib/run_moment.py
CHANGED
@@ -1,7 +1,13 @@
+import sys
 from dataclasses import dataclass
-from typing import
+from typing import Union, cast
 from urllib import parse

+if sys.version_info >= (3, 8):
+    from typing import Literal
+else:
+    from typing_extensions import Literal
+
 _STEP = Literal["_step"]

wandb/sdk/wandb_init.py
CHANGED
@@ -195,12 +195,6 @@ class _WandbInit:
         # Start with settings from wandb library singleton
         settings: Settings = self._wl.settings.copy()

-        # when using launch, we don't want to reuse the same run id from the singleton
-        # since users might launch multiple runs in the same process
-        # TODO(kdg): allow users to control this via launch settings
-        if settings.launch and singleton is not None:
-            settings.update({"run_id": None}, source=Source.INIT)
-
         settings_param = kwargs.pop("settings", None)
         if settings_param is not None and isinstance(settings_param, (Settings, dict)):
             settings.update(settings_param, source=Source.INIT)
@@ -1124,10 +1118,10 @@ def init(
             for saving hyperparameters to compare across runs. The ID cannot
             contain the following special characters: `/\#?%:`.
             See [our guide to resuming runs](https://docs.wandb.com/guides/runs/resuming).
-        fork_from: (str, optional) A string with the format
+        fork_from: (str, optional) A string with the format {run_id}?_step={step} describing
             a moment in a previous run to fork a new run from. Creates a new run that picks up
             logging history from the specified run at the specified moment. The target run must
-            be in the current project.
+            be in the current project. Example: `fork_from="my-run-id?_step=1234"`.

     Examples:
         ### Set where the run is logged