wandb 0.16.5__py3-none-any.whl → 0.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +1 -1
- wandb/cli/cli.py +5 -2
- wandb/integration/openai/fine_tuning.py +74 -37
- wandb/proto/v3/wandb_internal_pb2.py +192 -192
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_internal_pb2.py +192 -192
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/sdk/artifacts/artifact.py +25 -9
- wandb/sdk/artifacts/artifact_saver.py +16 -19
- wandb/sdk/interface/interface.py +18 -6
- wandb/sdk/launch/_launch.py +5 -0
- wandb/sdk/launch/_project_spec.py +5 -20
- wandb/sdk/launch/agent/agent.py +80 -37
- wandb/sdk/launch/agent/config.py +8 -0
- wandb/sdk/launch/builder/kaniko_builder.py +149 -134
- wandb/sdk/launch/create_job.py +43 -48
- wandb/sdk/launch/runner/kubernetes_monitor.py +3 -1
- wandb/sdk/launch/sweeps/scheduler.py +3 -1
- wandb/sdk/launch/utils.py +18 -0
- wandb/sdk/lib/_settings_toposort_generated.py +1 -0
- wandb/sdk/lib/run_moment.py +7 -1
- wandb/sdk/wandb_init.py +2 -8
- wandb/sdk/wandb_run.py +50 -34
- wandb/sdk/wandb_settings.py +2 -0
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/METADATA +1 -1
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/RECORD +30 -30
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/LICENSE +0 -0
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/WHEEL +0 -0
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.5.dist-info → wandb-0.16.6.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
import asyncio
|
2
2
|
import base64
|
3
|
+
import copy
|
3
4
|
import json
|
4
5
|
import logging
|
5
6
|
import os
|
@@ -8,7 +9,7 @@ import tarfile
|
|
8
9
|
import tempfile
|
9
10
|
import time
|
10
11
|
import traceback
|
11
|
-
from typing import Optional
|
12
|
+
from typing import Any, Dict, Optional
|
12
13
|
|
13
14
|
import wandb
|
14
15
|
from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
|
@@ -105,6 +106,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
105
106
|
secret_name: str = "",
|
106
107
|
secret_key: str = "",
|
107
108
|
image: str = "gcr.io/kaniko-project/executor:v1.11.0",
|
109
|
+
config: Optional[dict] = None,
|
108
110
|
):
|
109
111
|
"""Initialize a KanikoBuilder.
|
110
112
|
|
@@ -125,6 +127,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
125
127
|
self.secret_name = secret_name
|
126
128
|
self.secret_key = secret_key
|
127
129
|
self.image = image
|
130
|
+
self.kaniko_config = config or {}
|
128
131
|
|
129
132
|
@classmethod
|
130
133
|
def from_config(
|
@@ -170,6 +173,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
170
173
|
image_uri = config.get("destination")
|
171
174
|
if image_uri is not None:
|
172
175
|
registry = registry_from_uri(image_uri)
|
176
|
+
kaniko_config = config.get("kaniko-config", {})
|
173
177
|
|
174
178
|
return cls(
|
175
179
|
environment,
|
@@ -179,6 +183,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
179
183
|
secret_name=secret_name,
|
180
184
|
secret_key=secret_key,
|
181
185
|
image=kaniko_image,
|
186
|
+
config=kaniko_config,
|
182
187
|
)
|
183
188
|
|
184
189
|
async def verify(self) -> None:
|
@@ -289,7 +294,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
289
294
|
|
290
295
|
build_context = await self._upload_build_context(run_id, context_path)
|
291
296
|
build_job = await self._create_kaniko_job(
|
292
|
-
build_job_name, repo_uri, image_uri, build_context, core_v1
|
297
|
+
build_job_name, repo_uri, image_uri, build_context, core_v1, api_client
|
293
298
|
)
|
294
299
|
wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
|
295
300
|
|
@@ -324,7 +329,9 @@ class KanikoBuilder(AbstractBuilder):
|
|
324
329
|
):
|
325
330
|
if job_tracker:
|
326
331
|
job_tracker.set_err_stage("build")
|
327
|
-
raise Exception(
|
332
|
+
raise Exception(
|
333
|
+
f"Failed to build image in kaniko for job {run_id}. View logs with `kubectl logs -n {NAMESPACE} {build_job_name}`."
|
334
|
+
)
|
328
335
|
try:
|
329
336
|
pods_from_job = await core_v1.list_namespaced_pod(
|
330
337
|
namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
|
@@ -371,23 +378,32 @@ class KanikoBuilder(AbstractBuilder):
|
|
371
378
|
image_tag: str,
|
372
379
|
build_context_path: str,
|
373
380
|
core_client: client.CoreV1Api,
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
381
|
+
api_client,
|
382
|
+
) -> Dict[str, Any]:
|
383
|
+
job = copy.deepcopy(self.kaniko_config)
|
384
|
+
job_metadata = job.get("metadata", {})
|
385
|
+
job_labels = job_metadata.get("labels", {})
|
386
|
+
job_spec = job.get("spec", {})
|
387
|
+
pod_template = job_spec.get("template", {})
|
388
|
+
pod_metadata = pod_template.get("metadata", {})
|
389
|
+
pod_labels = pod_metadata.get("labels", {})
|
390
|
+
pod_spec = pod_template.get("spec", {})
|
391
|
+
volumes = pod_spec.get("volumes", [])
|
392
|
+
containers = pod_spec.get("containers") or [{}]
|
393
|
+
if len(containers) > 1:
|
394
|
+
raise LaunchError(
|
395
|
+
"Multiple container configs not supported for kaniko builder."
|
396
|
+
)
|
397
|
+
container = containers[0]
|
398
|
+
volume_mounts = container.get("volumeMounts", [])
|
399
|
+
env = container.get("env", [])
|
400
|
+
custom_args = container.get("args", [])
|
378
401
|
|
379
402
|
if PVC_MOUNT_PATH:
|
380
403
|
volumes.append(
|
381
|
-
|
382
|
-
name="kaniko-pvc",
|
383
|
-
persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
|
384
|
-
claim_name=PVC_NAME
|
385
|
-
),
|
386
|
-
)
|
387
|
-
)
|
388
|
-
volume_mounts.append(
|
389
|
-
client.V1VolumeMount(name="kaniko-pvc", mount_path="/context")
|
404
|
+
{"name": "kaniko-pvc", "persistentVolumeClaim": {"claimName": PVC_NAME}}
|
390
405
|
)
|
406
|
+
volume_mounts.append({"name": "kaniko-pvc", "mountPath": "/context"})
|
391
407
|
|
392
408
|
if bool(self.secret_name) != bool(self.secret_key):
|
393
409
|
raise LaunchError(
|
@@ -395,13 +411,13 @@ class KanikoBuilder(AbstractBuilder):
|
|
395
411
|
"for kaniko build. You provided only one of them."
|
396
412
|
)
|
397
413
|
if isinstance(self.registry, ElasticContainerRegistry):
|
398
|
-
env
|
399
|
-
|
400
|
-
name
|
401
|
-
value
|
402
|
-
|
403
|
-
|
404
|
-
# TODO: Refactor all of this environment/registry
|
414
|
+
env.append(
|
415
|
+
{
|
416
|
+
"name": "AWS_REGION",
|
417
|
+
"value": self.registry.region,
|
418
|
+
}
|
419
|
+
)
|
420
|
+
# TODO(ben): Refactor all of this environment/registry
|
405
421
|
# specific stuff into methods of those classes.
|
406
422
|
if isinstance(self.environment, AzureEnvironment):
|
407
423
|
# Use the core api to check if the secret exists
|
@@ -416,52 +432,46 @@ class KanikoBuilder(AbstractBuilder):
|
|
416
432
|
"namespace wandb. Please create it with the key password "
|
417
433
|
"set to your azure storage access key."
|
418
434
|
) from e
|
419
|
-
env
|
420
|
-
|
421
|
-
name
|
422
|
-
|
423
|
-
|
424
|
-
name
|
425
|
-
key
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
435
|
+
env.append(
|
436
|
+
{
|
437
|
+
"name": "AZURE_STORAGE_ACCESS_KEY",
|
438
|
+
"valueFrom": {
|
439
|
+
"secretKeyRef": {
|
440
|
+
"name": "azure-storage-access-key",
|
441
|
+
"key": "password",
|
442
|
+
}
|
443
|
+
},
|
444
|
+
}
|
445
|
+
)
|
430
446
|
if DOCKER_CONFIG_SECRET:
|
431
447
|
volumes.append(
|
432
|
-
|
433
|
-
name
|
434
|
-
secret
|
435
|
-
|
436
|
-
items
|
437
|
-
|
438
|
-
key
|
439
|
-
|
448
|
+
{
|
449
|
+
"name": "kaniko-docker-config",
|
450
|
+
"secret": {
|
451
|
+
"secretName": DOCKER_CONFIG_SECRET,
|
452
|
+
"items": [
|
453
|
+
{
|
454
|
+
"key": ".dockerconfigjson",
|
455
|
+
"path": "config.json",
|
456
|
+
}
|
440
457
|
],
|
441
|
-
|
442
|
-
|
458
|
+
},
|
459
|
+
}
|
443
460
|
)
|
444
461
|
volume_mounts.append(
|
445
|
-
|
446
|
-
name="kaniko-docker-config",
|
447
|
-
mount_path="/kaniko/.docker",
|
448
|
-
)
|
462
|
+
{"name": "kaniko-docker-config", "mountPath": "/kaniko/.docker"}
|
449
463
|
)
|
450
464
|
elif self.secret_name and self.secret_key:
|
451
|
-
volumes
|
452
|
-
|
453
|
-
name
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
name="docker-config", mount_path="/kaniko/.docker/"
|
462
|
-
),
|
463
|
-
]
|
464
|
-
# TODO: I don't like conditioning on the registry type here. As a
|
465
|
+
volumes.append(
|
466
|
+
{
|
467
|
+
"name": "docker-config",
|
468
|
+
"configMap": {"name": f"docker-config-{job_name}"},
|
469
|
+
}
|
470
|
+
)
|
471
|
+
volume_mounts.append(
|
472
|
+
{"name": "docker-config", "mountPath": "/kaniko/.docker"}
|
473
|
+
)
|
474
|
+
# TODO(ben): I don't like conditioning on the registry type here. As a
|
465
475
|
# future change I want the registry and environment classes to provide
|
466
476
|
# a list of environment variables and volume mounts that need to be
|
467
477
|
# added to the job. The environment class provides credentials for
|
@@ -475,90 +485,95 @@ class KanikoBuilder(AbstractBuilder):
|
|
475
485
|
elif isinstance(self.registry, GoogleArtifactRegistry):
|
476
486
|
mount_path = "/kaniko/.config/gcloud"
|
477
487
|
key = "config.json"
|
478
|
-
env
|
479
|
-
|
480
|
-
name
|
481
|
-
value
|
482
|
-
|
483
|
-
|
488
|
+
env.append(
|
489
|
+
{
|
490
|
+
"name": "GOOGLE_APPLICATION_CREDENTIALS",
|
491
|
+
"value": "/kaniko/.config/gcloud/config.json",
|
492
|
+
}
|
493
|
+
)
|
484
494
|
else:
|
485
495
|
raise LaunchError(
|
486
496
|
f"Registry type {type(self.registry)} not supported by kaniko"
|
487
497
|
)
|
488
|
-
|
489
|
-
|
490
|
-
name
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
498
|
+
volumes.append(
|
499
|
+
{
|
500
|
+
"name": self.secret_name,
|
501
|
+
"secret": {
|
502
|
+
"secretName": self.secret_name,
|
503
|
+
"items": [{"key": self.secret_key, "path": key}],
|
504
|
+
},
|
505
|
+
}
|
506
|
+
)
|
507
|
+
volume_mounts.append(
|
508
|
+
{
|
509
|
+
"name": self.secret_name,
|
510
|
+
"mountPath": mount_path,
|
511
|
+
"readOnly": True,
|
512
|
+
}
|
513
|
+
)
|
504
514
|
if isinstance(self.registry, AzureContainerRegistry):
|
505
|
-
#
|
506
|
-
|
507
|
-
|
508
|
-
name
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
name=f"docker-config-{job_name}",
|
516
|
-
),
|
517
|
-
),
|
518
|
-
]
|
515
|
+
# Add the docker config map
|
516
|
+
volumes.append(
|
517
|
+
{
|
518
|
+
"name": "docker-config",
|
519
|
+
"configMap": {"name": f"docker-config-{job_name}"},
|
520
|
+
}
|
521
|
+
)
|
522
|
+
volume_mounts.append(
|
523
|
+
{"name": "docker-config", "mountPath": "/kaniko/.docker/"}
|
524
|
+
)
|
519
525
|
# Kaniko doesn't want https:// at the begining of the image tag.
|
520
526
|
destination = image_tag
|
521
527
|
if destination.startswith("https://"):
|
522
528
|
destination = destination.replace("https://", "")
|
523
|
-
args =
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
"--cache
|
528
|
-
|
529
|
-
"--
|
530
|
-
"--compressed-caching
|
529
|
+
args = {
|
530
|
+
"--context": build_context_path,
|
531
|
+
"--dockerfile": _WANDB_DOCKERFILE_NAME,
|
532
|
+
"--destination": destination,
|
533
|
+
"--cache": "true",
|
534
|
+
"--cache-repo": repository.replace("https://", ""),
|
535
|
+
"--snapshot-mode": "redo",
|
536
|
+
"--compressed-caching": "false",
|
537
|
+
}
|
538
|
+
for custom_arg in custom_args:
|
539
|
+
arg_name, arg_value = custom_arg.split("=", 1)
|
540
|
+
args[arg_name] = arg_value
|
541
|
+
parsed_args = [
|
542
|
+
f"{arg_name}={arg_value}" for arg_name, arg_value in args.items()
|
531
543
|
]
|
532
|
-
container =
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
volume_mounts=volume_mounts,
|
537
|
-
env=env if env else None,
|
538
|
-
)
|
539
|
-
# Create and configure a spec section
|
540
|
-
labels = {"wandb": "launch"}
|
544
|
+
container["args"] = parsed_args
|
545
|
+
|
546
|
+
# Apply the rest of our defaults
|
547
|
+
pod_labels["wandb"] = "launch"
|
541
548
|
# This annotation is required to enable azure workload identity.
|
542
549
|
if isinstance(self.registry, AzureContainerRegistry):
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
restart_policy="Never",
|
548
|
-
active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
|
549
|
-
containers=[container],
|
550
|
-
volumes=volumes,
|
551
|
-
service_account_name=SERVICE_ACCOUNT_NAME,
|
552
|
-
),
|
550
|
+
pod_labels["azure.workload.identity/use"] = "true"
|
551
|
+
pod_spec["restartPolicy"] = pod_spec.get("restartPolicy", "Never")
|
552
|
+
pod_spec["activeDeadlineSeconds"] = pod_spec.get(
|
553
|
+
"activeDeadlineSeconds", _DEFAULT_BUILD_TIMEOUT_SECS
|
553
554
|
)
|
554
|
-
|
555
|
-
|
556
|
-
job = client.V1Job(
|
557
|
-
api_version="batch/v1",
|
558
|
-
kind="Job",
|
559
|
-
metadata=client.V1ObjectMeta(
|
560
|
-
name=job_name, namespace=NAMESPACE, labels={"wandb": "launch"}
|
561
|
-
),
|
562
|
-
spec=spec,
|
555
|
+
pod_spec["serviceAccountName"] = pod_spec.get(
|
556
|
+
"serviceAccountName", SERVICE_ACCOUNT_NAME
|
563
557
|
)
|
558
|
+
job_spec["backoffLimit"] = job_spec.get("backoffLimit", 0)
|
559
|
+
job_labels["wandb"] = "launch"
|
560
|
+
job_metadata["namespace"] = job_metadata.get("namespace", NAMESPACE)
|
561
|
+
job_metadata["name"] = job_metadata.get("name", job_name)
|
562
|
+
job["apiVersion"] = "batch/v1"
|
563
|
+
job["kind"] = "Job"
|
564
|
+
|
565
|
+
# Apply all nested configs from the bottom up
|
566
|
+
pod_metadata["labels"] = pod_labels
|
567
|
+
pod_template["metadata"] = pod_metadata
|
568
|
+
container["name"] = container.get("name", "wandb-container-build")
|
569
|
+
container["image"] = container.get("image", self.image)
|
570
|
+
container["volumeMounts"] = volume_mounts
|
571
|
+
container["env"] = env
|
572
|
+
pod_spec["containers"] = [container]
|
573
|
+
pod_spec["volumes"] = volumes
|
574
|
+
pod_template["spec"] = pod_spec
|
575
|
+
job_spec["template"] = pod_template
|
576
|
+
job_metadata["labels"] = job_labels
|
577
|
+
job["metadata"] = job_metadata
|
578
|
+
job["spec"] = job_spec
|
564
579
|
return job
|
wandb/sdk/launch/create_job.py
CHANGED
@@ -11,7 +11,7 @@ from wandb.sdk.artifacts.artifact import Artifact
|
|
11
11
|
from wandb.sdk.internal.job_builder import JobBuilder
|
12
12
|
from wandb.sdk.launch.builder.build import get_current_python_version
|
13
13
|
from wandb.sdk.launch.git_reference import GitReference
|
14
|
-
from wandb.sdk.launch.utils import _is_git_uri
|
14
|
+
from wandb.sdk.launch.utils import _is_git_uri, get_entrypoint_file
|
15
15
|
from wandb.sdk.lib import filesystem
|
16
16
|
from wandb.util import make_artifact_name_safe
|
17
17
|
|
@@ -145,6 +145,7 @@ def _create_job(
|
|
145
145
|
|
146
146
|
job_builder = _configure_job_builder_for_partial(tempdir.name, job_source=job_type)
|
147
147
|
if job_type == "code":
|
148
|
+
assert entrypoint is not None
|
148
149
|
job_name = _make_code_artifact(
|
149
150
|
api=api,
|
150
151
|
job_builder=job_builder,
|
@@ -233,7 +234,6 @@ def _make_metadata_for_partial_job(
|
|
233
234
|
return metadata, None
|
234
235
|
|
235
236
|
if job_type == "code":
|
236
|
-
path, entrypoint = _handle_artifact_entrypoint(path, entrypoint)
|
237
237
|
if not entrypoint:
|
238
238
|
wandb.termerror(
|
239
239
|
"Artifact jobs must have an entrypoint, either included in the path or specified with -E"
|
@@ -304,15 +304,22 @@ def _create_repo_metadata(
|
|
304
304
|
with open(os.path.join(local_dir, ".python-version")) as f:
|
305
305
|
python_version = f.read().strip().splitlines()[0]
|
306
306
|
else:
|
307
|
-
|
308
|
-
python_version = f"{major}.{minor}"
|
307
|
+
_, python_version = get_current_python_version()
|
309
308
|
|
310
309
|
python_version = _clean_python_version(python_version)
|
311
310
|
|
312
311
|
# check if entrypoint is valid
|
313
312
|
assert entrypoint is not None
|
314
|
-
|
315
|
-
|
313
|
+
entrypoint_list = entrypoint.split(" ")
|
314
|
+
entrypoint_file = get_entrypoint_file(entrypoint_list)
|
315
|
+
if not entrypoint_file:
|
316
|
+
wandb.termerror(
|
317
|
+
f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
|
318
|
+
)
|
319
|
+
return None
|
320
|
+
|
321
|
+
if not os.path.exists(os.path.join(local_dir, entrypoint_file)):
|
322
|
+
wandb.termerror(f"Entrypoint file {entrypoint_file} not found in git repo")
|
316
323
|
return None
|
317
324
|
|
318
325
|
metadata = {
|
@@ -320,9 +327,9 @@ def _create_repo_metadata(
|
|
320
327
|
"commit": commit,
|
321
328
|
"remote": ref.url,
|
322
329
|
},
|
323
|
-
"codePathLocal":
|
324
|
-
"codePath":
|
325
|
-
"entrypoint":
|
330
|
+
"codePathLocal": entrypoint_file, # not in git context, optionally also set local
|
331
|
+
"codePath": entrypoint_file,
|
332
|
+
"entrypoint": entrypoint_list,
|
326
333
|
"python": python_version, # used to build container
|
327
334
|
"notebook": False, # partial jobs from notebooks not supported
|
328
335
|
}
|
@@ -332,10 +339,17 @@ def _create_repo_metadata(
|
|
332
339
|
|
333
340
|
def _create_artifact_metadata(
|
334
341
|
path: str, entrypoint: str, runtime: Optional[str] = None
|
335
|
-
) -> Tuple[Dict[str, Any], List[str]]:
|
342
|
+
) -> Tuple[Optional[Dict[str, Any]], Optional[List[str]]]:
|
336
343
|
if not os.path.isdir(path):
|
337
344
|
wandb.termerror("Path must be a valid file or directory")
|
338
345
|
return {}, []
|
346
|
+
entrypoint_list = entrypoint.split(" ")
|
347
|
+
entrypoint_file = get_entrypoint_file(entrypoint_list)
|
348
|
+
if not entrypoint_file:
|
349
|
+
wandb.termerror(
|
350
|
+
f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
|
351
|
+
)
|
352
|
+
return None, None
|
339
353
|
|
340
354
|
# read local requirements.txt and dump to temp dir for builder
|
341
355
|
requirements = []
|
@@ -347,41 +361,17 @@ def _create_artifact_metadata(
|
|
347
361
|
if runtime:
|
348
362
|
python_version = _clean_python_version(runtime)
|
349
363
|
else:
|
350
|
-
python_version =
|
364
|
+
python_version, _ = get_current_python_version()
|
365
|
+
python_version = _clean_python_version(python_version)
|
351
366
|
|
352
|
-
metadata = {
|
367
|
+
metadata = {
|
368
|
+
"python": python_version,
|
369
|
+
"codePath": entrypoint_file,
|
370
|
+
"entrypoint": entrypoint_list,
|
371
|
+
}
|
353
372
|
return metadata, requirements
|
354
373
|
|
355
374
|
|
356
|
-
def _handle_artifact_entrypoint(
|
357
|
-
path: str, entrypoint: Optional[str] = None
|
358
|
-
) -> Tuple[str, Optional[str]]:
|
359
|
-
if os.path.isfile(path):
|
360
|
-
if entrypoint and path.endswith(entrypoint):
|
361
|
-
path = path.replace(entrypoint, "")
|
362
|
-
wandb.termwarn(
|
363
|
-
f"Both entrypoint provided and path contains file. Using provided entrypoint: {entrypoint}, path is now: {path}"
|
364
|
-
)
|
365
|
-
elif entrypoint:
|
366
|
-
wandb.termwarn(
|
367
|
-
f"Ignoring passed in entrypoint as it does not match file path found in 'path'. Path entrypoint: {path.split('/')[-1]}"
|
368
|
-
)
|
369
|
-
entrypoint = path.split("/")[-1]
|
370
|
-
path = "/".join(path.split("/")[:-1])
|
371
|
-
elif not entrypoint:
|
372
|
-
wandb.termerror("Entrypoint not valid")
|
373
|
-
return "", None
|
374
|
-
path = path or "." # when path is just an entrypoint, use cdw
|
375
|
-
|
376
|
-
if not os.path.exists(os.path.join(path, entrypoint)):
|
377
|
-
wandb.termerror(
|
378
|
-
f"Could not find execution point: {os.path.join(path, entrypoint)}"
|
379
|
-
)
|
380
|
-
return "", None
|
381
|
-
|
382
|
-
return path, entrypoint
|
383
|
-
|
384
|
-
|
385
375
|
def _configure_job_builder_for_partial(tmpdir: str, job_source: str) -> JobBuilder:
|
386
376
|
"""Configure job builder with temp dir and job source."""
|
387
377
|
# adjust git source to repo
|
@@ -411,7 +401,7 @@ def _make_code_artifact(
|
|
411
401
|
job_builder: JobBuilder,
|
412
402
|
run: "wandb.sdk.wandb_run.Run",
|
413
403
|
path: str,
|
414
|
-
entrypoint:
|
404
|
+
entrypoint: str,
|
415
405
|
entity: Optional[str],
|
416
406
|
project: Optional[str],
|
417
407
|
name: Optional[str],
|
@@ -420,17 +410,22 @@ def _make_code_artifact(
|
|
420
410
|
|
421
411
|
Returns the name of the eventual job.
|
422
412
|
"""
|
423
|
-
|
413
|
+
assert entrypoint is not None
|
414
|
+
entrypoint_list = entrypoint.split(" ")
|
415
|
+
entrypoint_file = get_entrypoint_file(entrypoint_list)
|
416
|
+
if not entrypoint_file:
|
417
|
+
wandb.termerror(
|
418
|
+
f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
|
419
|
+
)
|
420
|
+
return None
|
421
|
+
|
422
|
+
artifact_name = _make_code_artifact_name(os.path.join(path, entrypoint_file), name)
|
424
423
|
code_artifact = wandb.Artifact(
|
425
424
|
name=artifact_name,
|
426
425
|
type="code",
|
427
426
|
description="Code artifact for job",
|
428
427
|
)
|
429
428
|
|
430
|
-
# Update path and entrypoint vars to match metadata
|
431
|
-
# TODO(gst): consolidate into one place
|
432
|
-
path, entrypoint = _handle_artifact_entrypoint(path, entrypoint)
|
433
|
-
|
434
429
|
try:
|
435
430
|
code_artifact.add_dir(path)
|
436
431
|
except Exception as e:
|
@@ -451,7 +446,7 @@ def _make_code_artifact(
|
|
451
446
|
project_name=project,
|
452
447
|
run_name=run.id, # run will be deleted after creation
|
453
448
|
description="Code artifact for job",
|
454
|
-
metadata={"codePath": path, "entrypoint":
|
449
|
+
metadata={"codePath": path, "entrypoint": entrypoint_file},
|
455
450
|
is_user_created=True,
|
456
451
|
aliases=[
|
457
452
|
{"artifactCollectionName": artifact_name, "alias": a} for a in ["latest"]
|
@@ -433,6 +433,8 @@ class SafeWatch:
|
|
433
433
|
del kwargs["resource_version"]
|
434
434
|
self._last_seen_resource_version = None
|
435
435
|
except Exception as E:
|
436
|
+
exc_type = type(E).__name__
|
437
|
+
stack_trace = traceback.format_exc()
|
436
438
|
wandb.termerror(
|
437
|
-
f"Unknown exception in event stream: {E}, attempting to recover"
|
439
|
+
f"Unknown exception in event stream of type {exc_type}: {E}, attempting to recover. Stack trace: {stack_trace}"
|
438
440
|
)
|
@@ -157,7 +157,9 @@ class Scheduler(ABC):
|
|
157
157
|
self._runs: Dict[str, SweepRun] = {}
|
158
158
|
# Threading lock to ensure thread-safe access to the runs dictionary
|
159
159
|
self._threading_lock: threading.Lock = threading.Lock()
|
160
|
-
self._polling_sleep =
|
160
|
+
self._polling_sleep = (
|
161
|
+
polling_sleep if polling_sleep is not None else DEFAULT_POLLING_SLEEP
|
162
|
+
)
|
161
163
|
self._project_queue = project_queue
|
162
164
|
# Optionally run multiple workers in (pseudo-)parallel. Workers do not
|
163
165
|
# actually run training workloads, they simply send heartbeat messages
|
wandb/sdk/launch/utils.py
CHANGED
@@ -846,3 +846,21 @@ def fetch_and_validate_template_variables(
|
|
846
846
|
raise LaunchError(f"Value for {key} must be of type {field_type}.")
|
847
847
|
template_variables[key] = val
|
848
848
|
return template_variables
|
849
|
+
|
850
|
+
|
851
|
+
def get_entrypoint_file(entrypoint: List[str]) -> Optional[str]:
|
852
|
+
"""Get the entrypoint file from the given command.
|
853
|
+
|
854
|
+
Args:
|
855
|
+
entrypoint (List[str]): List of command and arguments.
|
856
|
+
|
857
|
+
Returns:
|
858
|
+
Optional[str]: The entrypoint file if found, otherwise None.
|
859
|
+
"""
|
860
|
+
if not entrypoint:
|
861
|
+
return None
|
862
|
+
if entrypoint[0].endswith(".py") or entrypoint[0].endswith(".sh"):
|
863
|
+
return entrypoint[0]
|
864
|
+
if len(entrypoint) < 2:
|
865
|
+
return None
|
866
|
+
return entrypoint[1]
|
wandb/sdk/lib/run_moment.py
CHANGED
@@ -1,7 +1,13 @@
|
|
1
|
+
import sys
|
1
2
|
from dataclasses import dataclass
|
2
|
-
from typing import
|
3
|
+
from typing import Union, cast
|
3
4
|
from urllib import parse
|
4
5
|
|
6
|
+
if sys.version_info >= (3, 8):
|
7
|
+
from typing import Literal
|
8
|
+
else:
|
9
|
+
from typing_extensions import Literal
|
10
|
+
|
5
11
|
_STEP = Literal["_step"]
|
6
12
|
|
7
13
|
|
wandb/sdk/wandb_init.py
CHANGED
@@ -195,12 +195,6 @@ class _WandbInit:
|
|
195
195
|
# Start with settings from wandb library singleton
|
196
196
|
settings: Settings = self._wl.settings.copy()
|
197
197
|
|
198
|
-
# when using launch, we don't want to reuse the same run id from the singleton
|
199
|
-
# since users might launch multiple runs in the same process
|
200
|
-
# TODO(kdg): allow users to control this via launch settings
|
201
|
-
if settings.launch and singleton is not None:
|
202
|
-
settings.update({"run_id": None}, source=Source.INIT)
|
203
|
-
|
204
198
|
settings_param = kwargs.pop("settings", None)
|
205
199
|
if settings_param is not None and isinstance(settings_param, (Settings, dict)):
|
206
200
|
settings.update(settings_param, source=Source.INIT)
|
@@ -1124,10 +1118,10 @@ def init(
|
|
1124
1118
|
for saving hyperparameters to compare across runs. The ID cannot
|
1125
1119
|
contain the following special characters: `/\#?%:`.
|
1126
1120
|
See [our guide to resuming runs](https://docs.wandb.com/guides/runs/resuming).
|
1127
|
-
fork_from: (str, optional) A string with the format
|
1121
|
+
fork_from: (str, optional) A string with the format {run_id}?_step={step} describing
|
1128
1122
|
a moment in a previous run to fork a new run from. Creates a new run that picks up
|
1129
1123
|
logging history from the specified run at the specified moment. The target run must
|
1130
|
-
be in the current project.
|
1124
|
+
be in the current project. Example: `fork_from="my-run-id?_step=1234"`.
|
1131
1125
|
|
1132
1126
|
Examples:
|
1133
1127
|
### Set where the run is logged
|