wandb 0.16.5__py3-none-any.whl → 0.17.0rc1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- package_readme.md +95 -0
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +0 -1
- wandb/analytics/sentry.py +2 -1
- wandb/apis/importers/internals/protocols.py +30 -56
- wandb/apis/importers/mlflow.py +13 -26
- wandb/apis/importers/wandb.py +8 -14
- wandb/apis/public/api.py +1 -0
- wandb/apis/public/artifacts.py +1 -0
- wandb/apis/public/files.py +1 -0
- wandb/apis/public/history.py +1 -0
- wandb/apis/public/jobs.py +1 -0
- wandb/apis/public/projects.py +1 -0
- wandb/apis/public/reports.py +1 -0
- wandb/apis/public/runs.py +1 -0
- wandb/apis/public/sweeps.py +1 -0
- wandb/apis/public/teams.py +1 -0
- wandb/apis/public/users.py +1 -0
- wandb/apis/reports/v1/_blocks.py +2 -6
- wandb/apis/reports/v2/gql.py +1 -0
- wandb/apis/reports/v2/interface.py +3 -4
- wandb/apis/reports/v2/internal.py +5 -8
- wandb/cli/cli.py +7 -4
- wandb/data_types.py +3 -3
- wandb/env.py +35 -5
- wandb/errors/__init__.py +5 -0
- wandb/integration/catboost/catboost.py +1 -1
- wandb/integration/fastai/__init__.py +1 -0
- wandb/integration/keras/__init__.py +1 -0
- wandb/integration/keras/keras.py +6 -6
- wandb/integration/langchain/wandb_tracer.py +1 -0
- wandb/integration/lightning/fabric/logger.py +1 -3
- wandb/integration/metaflow/metaflow.py +41 -6
- wandb/integration/openai/fine_tuning.py +77 -40
- wandb/keras/__init__.py +1 -0
- wandb/proto/v3/wandb_internal_pb2.py +364 -332
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_internal_pb2.py +322 -316
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/wandb_internal_codegen.py +0 -25
- wandb/sdk/artifacts/artifact.py +41 -13
- wandb/sdk/artifacts/artifact_download_logger.py +1 -0
- wandb/sdk/artifacts/artifact_file_cache.py +18 -4
- wandb/sdk/artifacts/artifact_instance_cache.py +1 -0
- wandb/sdk/artifacts/artifact_manifest.py +1 -0
- wandb/sdk/artifacts/artifact_manifest_entry.py +1 -0
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
- wandb/sdk/artifacts/artifact_saver.py +21 -21
- wandb/sdk/artifacts/artifact_state.py +1 -0
- wandb/sdk/artifacts/artifact_ttl.py +1 -0
- wandb/sdk/artifacts/exceptions.py +1 -0
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +13 -18
- wandb/sdk/artifacts/storage_handlers/http_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +5 -3
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +1 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +1 -0
- wandb/sdk/artifacts/storage_policy.py +1 -0
- wandb/sdk/data_types/base_types/media.py +3 -6
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
- wandb/sdk/integration_utils/auto_logging.py +5 -6
- wandb/sdk/integration_utils/data_logging.py +5 -1
- wandb/sdk/interface/interface.py +72 -37
- wandb/sdk/interface/interface_shared.py +7 -13
- wandb/sdk/internal/datastore.py +1 -1
- wandb/sdk/internal/handler.py +18 -2
- wandb/sdk/internal/internal.py +0 -1
- wandb/sdk/internal/internal_util.py +0 -1
- wandb/sdk/internal/job_builder.py +4 -3
- wandb/sdk/internal/profiler.py +1 -0
- wandb/sdk/internal/run.py +1 -0
- wandb/sdk/internal/sender.py +1 -1
- wandb/sdk/internal/system/assets/gpu_amd.py +44 -44
- wandb/sdk/internal/system/assets/gpu_apple.py +56 -11
- wandb/sdk/internal/system/assets/interfaces.py +6 -8
- wandb/sdk/internal/system/assets/open_metrics.py +2 -2
- wandb/sdk/internal/system/assets/trainium.py +1 -3
- wandb/sdk/launch/_launch.py +5 -0
- wandb/sdk/launch/_project_spec.py +10 -23
- wandb/sdk/launch/agent/agent.py +81 -37
- wandb/sdk/launch/agent/config.py +80 -11
- wandb/sdk/launch/builder/abstract.py +1 -0
- wandb/sdk/launch/builder/build.py +28 -1
- wandb/sdk/launch/builder/docker_builder.py +1 -0
- wandb/sdk/launch/builder/kaniko_builder.py +149 -134
- wandb/sdk/launch/builder/noop.py +1 -0
- wandb/sdk/launch/create_job.py +61 -48
- wandb/sdk/launch/environment/abstract.py +1 -0
- wandb/sdk/launch/environment/gcp_environment.py +1 -0
- wandb/sdk/launch/environment/local_environment.py +1 -0
- wandb/sdk/launch/loader.py +1 -0
- wandb/sdk/launch/registry/abstract.py +1 -0
- wandb/sdk/launch/registry/azure_container_registry.py +1 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +1 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +1 -0
- wandb/sdk/launch/registry/local_registry.py +1 -0
- wandb/sdk/launch/runner/abstract.py +1 -0
- wandb/sdk/launch/runner/kubernetes_monitor.py +4 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +4 -3
- wandb/sdk/launch/runner/sagemaker_runner.py +11 -10
- wandb/sdk/launch/sweeps/scheduler.py +4 -1
- wandb/sdk/launch/sweeps/scheduler_sweep.py +1 -0
- wandb/sdk/launch/sweeps/utils.py +1 -1
- wandb/sdk/launch/utils.py +21 -3
- wandb/sdk/lib/_settings_toposort_generated.py +1 -0
- wandb/sdk/lib/fsm.py +8 -12
- wandb/sdk/lib/gitlib.py +4 -4
- wandb/sdk/lib/lazyloader.py +0 -1
- wandb/sdk/lib/proto_util.py +1 -1
- wandb/sdk/lib/retry.py +3 -2
- wandb/sdk/lib/run_moment.py +7 -1
- wandb/sdk/service/service.py +17 -15
- wandb/sdk/verify/verify.py +2 -1
- wandb/sdk/wandb_init.py +2 -8
- wandb/sdk/wandb_manager.py +2 -2
- wandb/sdk/wandb_require.py +5 -0
- wandb/sdk/wandb_run.py +64 -46
- wandb/sdk/wandb_settings.py +2 -1
- wandb/sklearn/__init__.py +1 -0
- wandb/sklearn/plot/__init__.py +1 -0
- wandb/sklearn/plot/classifier.py +1 -0
- wandb/sklearn/plot/clusterer.py +1 -0
- wandb/sklearn/plot/regressor.py +1 -0
- wandb/sklearn/plot/shared.py +1 -0
- wandb/sklearn/utils.py +1 -0
- wandb/testing/relay.py +4 -4
- wandb/trigger.py +1 -0
- wandb/util.py +40 -17
- wandb/wandb_controller.py +0 -1
- wandb/wandb_torch.py +1 -2
- {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info}/METADATA +68 -69
- {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info}/RECORD +139 -140
- {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info}/WHEEL +1 -2
- wandb/bin/apple_gpu_stats +0 -0
- wandb-0.16.5.dist-info/top_level.txt +0 -1
- {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info/licenses}/LICENSE +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
import asyncio
|
2
2
|
import base64
|
3
|
+
import copy
|
3
4
|
import json
|
4
5
|
import logging
|
5
6
|
import os
|
@@ -8,7 +9,7 @@ import tarfile
|
|
8
9
|
import tempfile
|
9
10
|
import time
|
10
11
|
import traceback
|
11
|
-
from typing import Optional
|
12
|
+
from typing import Any, Dict, Optional
|
12
13
|
|
13
14
|
import wandb
|
14
15
|
from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
|
@@ -105,6 +106,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
105
106
|
secret_name: str = "",
|
106
107
|
secret_key: str = "",
|
107
108
|
image: str = "gcr.io/kaniko-project/executor:v1.11.0",
|
109
|
+
config: Optional[dict] = None,
|
108
110
|
):
|
109
111
|
"""Initialize a KanikoBuilder.
|
110
112
|
|
@@ -125,6 +127,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
125
127
|
self.secret_name = secret_name
|
126
128
|
self.secret_key = secret_key
|
127
129
|
self.image = image
|
130
|
+
self.kaniko_config = config or {}
|
128
131
|
|
129
132
|
@classmethod
|
130
133
|
def from_config(
|
@@ -170,6 +173,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
170
173
|
image_uri = config.get("destination")
|
171
174
|
if image_uri is not None:
|
172
175
|
registry = registry_from_uri(image_uri)
|
176
|
+
kaniko_config = config.get("kaniko-config", {})
|
173
177
|
|
174
178
|
return cls(
|
175
179
|
environment,
|
@@ -179,6 +183,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
179
183
|
secret_name=secret_name,
|
180
184
|
secret_key=secret_key,
|
181
185
|
image=kaniko_image,
|
186
|
+
config=kaniko_config,
|
182
187
|
)
|
183
188
|
|
184
189
|
async def verify(self) -> None:
|
@@ -289,7 +294,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
289
294
|
|
290
295
|
build_context = await self._upload_build_context(run_id, context_path)
|
291
296
|
build_job = await self._create_kaniko_job(
|
292
|
-
build_job_name, repo_uri, image_uri, build_context, core_v1
|
297
|
+
build_job_name, repo_uri, image_uri, build_context, core_v1, api_client
|
293
298
|
)
|
294
299
|
wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
|
295
300
|
|
@@ -324,7 +329,9 @@ class KanikoBuilder(AbstractBuilder):
|
|
324
329
|
):
|
325
330
|
if job_tracker:
|
326
331
|
job_tracker.set_err_stage("build")
|
327
|
-
raise Exception(
|
332
|
+
raise Exception(
|
333
|
+
f"Failed to build image in kaniko for job {run_id}. View logs with `kubectl logs -n {NAMESPACE} {build_job_name}`."
|
334
|
+
)
|
328
335
|
try:
|
329
336
|
pods_from_job = await core_v1.list_namespaced_pod(
|
330
337
|
namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
|
@@ -371,23 +378,32 @@ class KanikoBuilder(AbstractBuilder):
|
|
371
378
|
image_tag: str,
|
372
379
|
build_context_path: str,
|
373
380
|
core_client: client.CoreV1Api,
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
381
|
+
api_client,
|
382
|
+
) -> Dict[str, Any]:
|
383
|
+
job = copy.deepcopy(self.kaniko_config)
|
384
|
+
job_metadata = job.get("metadata", {})
|
385
|
+
job_labels = job_metadata.get("labels", {})
|
386
|
+
job_spec = job.get("spec", {})
|
387
|
+
pod_template = job_spec.get("template", {})
|
388
|
+
pod_metadata = pod_template.get("metadata", {})
|
389
|
+
pod_labels = pod_metadata.get("labels", {})
|
390
|
+
pod_spec = pod_template.get("spec", {})
|
391
|
+
volumes = pod_spec.get("volumes", [])
|
392
|
+
containers = pod_spec.get("containers") or [{}]
|
393
|
+
if len(containers) > 1:
|
394
|
+
raise LaunchError(
|
395
|
+
"Multiple container configs not supported for kaniko builder."
|
396
|
+
)
|
397
|
+
container = containers[0]
|
398
|
+
volume_mounts = container.get("volumeMounts", [])
|
399
|
+
env = container.get("env", [])
|
400
|
+
custom_args = container.get("args", [])
|
378
401
|
|
379
402
|
if PVC_MOUNT_PATH:
|
380
403
|
volumes.append(
|
381
|
-
|
382
|
-
name="kaniko-pvc",
|
383
|
-
persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
|
384
|
-
claim_name=PVC_NAME
|
385
|
-
),
|
386
|
-
)
|
387
|
-
)
|
388
|
-
volume_mounts.append(
|
389
|
-
client.V1VolumeMount(name="kaniko-pvc", mount_path="/context")
|
404
|
+
{"name": "kaniko-pvc", "persistentVolumeClaim": {"claimName": PVC_NAME}}
|
390
405
|
)
|
406
|
+
volume_mounts.append({"name": "kaniko-pvc", "mountPath": "/context"})
|
391
407
|
|
392
408
|
if bool(self.secret_name) != bool(self.secret_key):
|
393
409
|
raise LaunchError(
|
@@ -395,13 +411,13 @@ class KanikoBuilder(AbstractBuilder):
|
|
395
411
|
"for kaniko build. You provided only one of them."
|
396
412
|
)
|
397
413
|
if isinstance(self.registry, ElasticContainerRegistry):
|
398
|
-
env
|
399
|
-
|
400
|
-
name
|
401
|
-
value
|
402
|
-
|
403
|
-
|
404
|
-
# TODO: Refactor all of this environment/registry
|
414
|
+
env.append(
|
415
|
+
{
|
416
|
+
"name": "AWS_REGION",
|
417
|
+
"value": self.registry.region,
|
418
|
+
}
|
419
|
+
)
|
420
|
+
# TODO(ben): Refactor all of this environment/registry
|
405
421
|
# specific stuff into methods of those classes.
|
406
422
|
if isinstance(self.environment, AzureEnvironment):
|
407
423
|
# Use the core api to check if the secret exists
|
@@ -416,52 +432,46 @@ class KanikoBuilder(AbstractBuilder):
|
|
416
432
|
"namespace wandb. Please create it with the key password "
|
417
433
|
"set to your azure storage access key."
|
418
434
|
) from e
|
419
|
-
env
|
420
|
-
|
421
|
-
name
|
422
|
-
|
423
|
-
|
424
|
-
name
|
425
|
-
key
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
435
|
+
env.append(
|
436
|
+
{
|
437
|
+
"name": "AZURE_STORAGE_ACCESS_KEY",
|
438
|
+
"valueFrom": {
|
439
|
+
"secretKeyRef": {
|
440
|
+
"name": "azure-storage-access-key",
|
441
|
+
"key": "password",
|
442
|
+
}
|
443
|
+
},
|
444
|
+
}
|
445
|
+
)
|
430
446
|
if DOCKER_CONFIG_SECRET:
|
431
447
|
volumes.append(
|
432
|
-
|
433
|
-
name
|
434
|
-
secret
|
435
|
-
|
436
|
-
items
|
437
|
-
|
438
|
-
key
|
439
|
-
|
448
|
+
{
|
449
|
+
"name": "kaniko-docker-config",
|
450
|
+
"secret": {
|
451
|
+
"secretName": DOCKER_CONFIG_SECRET,
|
452
|
+
"items": [
|
453
|
+
{
|
454
|
+
"key": ".dockerconfigjson",
|
455
|
+
"path": "config.json",
|
456
|
+
}
|
440
457
|
],
|
441
|
-
|
442
|
-
|
458
|
+
},
|
459
|
+
}
|
443
460
|
)
|
444
461
|
volume_mounts.append(
|
445
|
-
|
446
|
-
name="kaniko-docker-config",
|
447
|
-
mount_path="/kaniko/.docker",
|
448
|
-
)
|
462
|
+
{"name": "kaniko-docker-config", "mountPath": "/kaniko/.docker"}
|
449
463
|
)
|
450
464
|
elif self.secret_name and self.secret_key:
|
451
|
-
volumes
|
452
|
-
|
453
|
-
name
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
name="docker-config", mount_path="/kaniko/.docker/"
|
462
|
-
),
|
463
|
-
]
|
464
|
-
# TODO: I don't like conditioning on the registry type here. As a
|
465
|
+
volumes.append(
|
466
|
+
{
|
467
|
+
"name": "docker-config",
|
468
|
+
"configMap": {"name": f"docker-config-{job_name}"},
|
469
|
+
}
|
470
|
+
)
|
471
|
+
volume_mounts.append(
|
472
|
+
{"name": "docker-config", "mountPath": "/kaniko/.docker"}
|
473
|
+
)
|
474
|
+
# TODO(ben): I don't like conditioning on the registry type here. As a
|
465
475
|
# future change I want the registry and environment classes to provide
|
466
476
|
# a list of environment variables and volume mounts that need to be
|
467
477
|
# added to the job. The environment class provides credentials for
|
@@ -475,90 +485,95 @@ class KanikoBuilder(AbstractBuilder):
|
|
475
485
|
elif isinstance(self.registry, GoogleArtifactRegistry):
|
476
486
|
mount_path = "/kaniko/.config/gcloud"
|
477
487
|
key = "config.json"
|
478
|
-
env
|
479
|
-
|
480
|
-
name
|
481
|
-
value
|
482
|
-
|
483
|
-
|
488
|
+
env.append(
|
489
|
+
{
|
490
|
+
"name": "GOOGLE_APPLICATION_CREDENTIALS",
|
491
|
+
"value": "/kaniko/.config/gcloud/config.json",
|
492
|
+
}
|
493
|
+
)
|
484
494
|
else:
|
485
495
|
raise LaunchError(
|
486
496
|
f"Registry type {type(self.registry)} not supported by kaniko"
|
487
497
|
)
|
488
|
-
|
489
|
-
|
490
|
-
name
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
498
|
+
volumes.append(
|
499
|
+
{
|
500
|
+
"name": self.secret_name,
|
501
|
+
"secret": {
|
502
|
+
"secretName": self.secret_name,
|
503
|
+
"items": [{"key": self.secret_key, "path": key}],
|
504
|
+
},
|
505
|
+
}
|
506
|
+
)
|
507
|
+
volume_mounts.append(
|
508
|
+
{
|
509
|
+
"name": self.secret_name,
|
510
|
+
"mountPath": mount_path,
|
511
|
+
"readOnly": True,
|
512
|
+
}
|
513
|
+
)
|
504
514
|
if isinstance(self.registry, AzureContainerRegistry):
|
505
|
-
#
|
506
|
-
|
507
|
-
|
508
|
-
name
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
name=f"docker-config-{job_name}",
|
516
|
-
),
|
517
|
-
),
|
518
|
-
]
|
515
|
+
# Add the docker config map
|
516
|
+
volumes.append(
|
517
|
+
{
|
518
|
+
"name": "docker-config",
|
519
|
+
"configMap": {"name": f"docker-config-{job_name}"},
|
520
|
+
}
|
521
|
+
)
|
522
|
+
volume_mounts.append(
|
523
|
+
{"name": "docker-config", "mountPath": "/kaniko/.docker/"}
|
524
|
+
)
|
519
525
|
# Kaniko doesn't want https:// at the beginning of the image tag.
|
520
526
|
destination = image_tag
|
521
527
|
if destination.startswith("https://"):
|
522
528
|
destination = destination.replace("https://", "")
|
523
|
-
args =
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
"--cache
|
528
|
-
|
529
|
-
"--
|
530
|
-
"--compressed-caching
|
529
|
+
args = {
|
530
|
+
"--context": build_context_path,
|
531
|
+
"--dockerfile": _WANDB_DOCKERFILE_NAME,
|
532
|
+
"--destination": destination,
|
533
|
+
"--cache": "true",
|
534
|
+
"--cache-repo": repository.replace("https://", ""),
|
535
|
+
"--snapshot-mode": "redo",
|
536
|
+
"--compressed-caching": "false",
|
537
|
+
}
|
538
|
+
for custom_arg in custom_args:
|
539
|
+
arg_name, arg_value = custom_arg.split("=", 1)
|
540
|
+
args[arg_name] = arg_value
|
541
|
+
parsed_args = [
|
542
|
+
f"{arg_name}={arg_value}" for arg_name, arg_value in args.items()
|
531
543
|
]
|
532
|
-
container =
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
volume_mounts=volume_mounts,
|
537
|
-
env=env if env else None,
|
538
|
-
)
|
539
|
-
# Create and configure a spec section
|
540
|
-
labels = {"wandb": "launch"}
|
544
|
+
container["args"] = parsed_args
|
545
|
+
|
546
|
+
# Apply the rest of our defaults
|
547
|
+
pod_labels["wandb"] = "launch"
|
541
548
|
# This annotation is required to enable azure workload identity.
|
542
549
|
if isinstance(self.registry, AzureContainerRegistry):
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
restart_policy="Never",
|
548
|
-
active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
|
549
|
-
containers=[container],
|
550
|
-
volumes=volumes,
|
551
|
-
service_account_name=SERVICE_ACCOUNT_NAME,
|
552
|
-
),
|
550
|
+
pod_labels["azure.workload.identity/use"] = "true"
|
551
|
+
pod_spec["restartPolicy"] = pod_spec.get("restartPolicy", "Never")
|
552
|
+
pod_spec["activeDeadlineSeconds"] = pod_spec.get(
|
553
|
+
"activeDeadlineSeconds", _DEFAULT_BUILD_TIMEOUT_SECS
|
553
554
|
)
|
554
|
-
|
555
|
-
|
556
|
-
job = client.V1Job(
|
557
|
-
api_version="batch/v1",
|
558
|
-
kind="Job",
|
559
|
-
metadata=client.V1ObjectMeta(
|
560
|
-
name=job_name, namespace=NAMESPACE, labels={"wandb": "launch"}
|
561
|
-
),
|
562
|
-
spec=spec,
|
555
|
+
pod_spec["serviceAccountName"] = pod_spec.get(
|
556
|
+
"serviceAccountName", SERVICE_ACCOUNT_NAME
|
563
557
|
)
|
558
|
+
job_spec["backoffLimit"] = job_spec.get("backoffLimit", 0)
|
559
|
+
job_labels["wandb"] = "launch"
|
560
|
+
job_metadata["namespace"] = job_metadata.get("namespace", NAMESPACE)
|
561
|
+
job_metadata["name"] = job_metadata.get("name", job_name)
|
562
|
+
job["apiVersion"] = "batch/v1"
|
563
|
+
job["kind"] = "Job"
|
564
|
+
|
565
|
+
# Apply all nested configs from the bottom up
|
566
|
+
pod_metadata["labels"] = pod_labels
|
567
|
+
pod_template["metadata"] = pod_metadata
|
568
|
+
container["name"] = container.get("name", "wandb-container-build")
|
569
|
+
container["image"] = container.get("image", self.image)
|
570
|
+
container["volumeMounts"] = volume_mounts
|
571
|
+
container["env"] = env
|
572
|
+
pod_spec["containers"] = [container]
|
573
|
+
pod_spec["volumes"] = volumes
|
574
|
+
pod_template["spec"] = pod_spec
|
575
|
+
job_spec["template"] = pod_template
|
576
|
+
job_metadata["labels"] = job_labels
|
577
|
+
job["metadata"] = job_metadata
|
578
|
+
job["spec"] = job_spec
|
564
579
|
return job
|
wandb/sdk/launch/builder/noop.py
CHANGED
wandb/sdk/launch/create_job.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
3
|
import os
|
4
|
+
import re
|
4
5
|
import sys
|
5
6
|
import tempfile
|
6
7
|
from typing import Any, Dict, List, Optional, Tuple
|
@@ -11,7 +12,7 @@ from wandb.sdk.artifacts.artifact import Artifact
|
|
11
12
|
from wandb.sdk.internal.job_builder import JobBuilder
|
12
13
|
from wandb.sdk.launch.builder.build import get_current_python_version
|
13
14
|
from wandb.sdk.launch.git_reference import GitReference
|
14
|
-
from wandb.sdk.launch.utils import _is_git_uri
|
15
|
+
from wandb.sdk.launch.utils import _is_git_uri, get_entrypoint_file
|
15
16
|
from wandb.sdk.lib import filesystem
|
16
17
|
from wandb.util import make_artifact_name_safe
|
17
18
|
|
@@ -19,6 +20,9 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
19
20
|
_logger = logging.getLogger("wandb")
|
20
21
|
|
21
22
|
|
23
|
+
CODE_ARTIFACT_EXCLUDE_PATHS = ["wandb", ".git"]
|
24
|
+
|
25
|
+
|
22
26
|
def create_job(
|
23
27
|
path: str,
|
24
28
|
job_type: str,
|
@@ -107,6 +111,13 @@ def _create_job(
|
|
107
111
|
)
|
108
112
|
return None, "", []
|
109
113
|
|
114
|
+
if runtime is not None:
|
115
|
+
if not re.match(r"^3\.\d+$", runtime):
|
116
|
+
wandb.termerror(
|
117
|
+
f"Runtime (-r, --runtime) must be a minor version of Python 3, "
|
118
|
+
f"e.g. 3.9 or 3.10, received {runtime}"
|
119
|
+
)
|
120
|
+
return None, "", []
|
110
121
|
aliases = aliases or []
|
111
122
|
tempdir = tempfile.TemporaryDirectory()
|
112
123
|
try:
|
@@ -145,6 +156,7 @@ def _create_job(
|
|
145
156
|
|
146
157
|
job_builder = _configure_job_builder_for_partial(tempdir.name, job_source=job_type)
|
147
158
|
if job_type == "code":
|
159
|
+
assert entrypoint is not None
|
148
160
|
job_name = _make_code_artifact(
|
149
161
|
api=api,
|
150
162
|
job_builder=job_builder,
|
@@ -233,7 +245,6 @@ def _make_metadata_for_partial_job(
|
|
233
245
|
return metadata, None
|
234
246
|
|
235
247
|
if job_type == "code":
|
236
|
-
path, entrypoint = _handle_artifact_entrypoint(path, entrypoint)
|
237
248
|
if not entrypoint:
|
238
249
|
wandb.termerror(
|
239
250
|
"Artifact jobs must have an entrypoint, either included in the path or specified with -E"
|
@@ -304,15 +315,22 @@ def _create_repo_metadata(
|
|
304
315
|
with open(os.path.join(local_dir, ".python-version")) as f:
|
305
316
|
python_version = f.read().strip().splitlines()[0]
|
306
317
|
else:
|
307
|
-
|
308
|
-
python_version = f"{major}.{minor}"
|
318
|
+
_, python_version = get_current_python_version()
|
309
319
|
|
310
320
|
python_version = _clean_python_version(python_version)
|
311
321
|
|
312
322
|
# check if entrypoint is valid
|
313
323
|
assert entrypoint is not None
|
314
|
-
|
315
|
-
|
324
|
+
entrypoint_list = entrypoint.split(" ")
|
325
|
+
entrypoint_file = get_entrypoint_file(entrypoint_list)
|
326
|
+
if not entrypoint_file:
|
327
|
+
wandb.termerror(
|
328
|
+
f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
|
329
|
+
)
|
330
|
+
return None
|
331
|
+
|
332
|
+
if not os.path.exists(os.path.join(local_dir, entrypoint_file)):
|
333
|
+
wandb.termerror(f"Entrypoint file {entrypoint_file} not found in git repo")
|
316
334
|
return None
|
317
335
|
|
318
336
|
metadata = {
|
@@ -320,9 +338,9 @@ def _create_repo_metadata(
|
|
320
338
|
"commit": commit,
|
321
339
|
"remote": ref.url,
|
322
340
|
},
|
323
|
-
"codePathLocal":
|
324
|
-
"codePath":
|
325
|
-
"entrypoint":
|
341
|
+
"codePathLocal": entrypoint_file, # not in git context, optionally also set local
|
342
|
+
"codePath": entrypoint_file,
|
343
|
+
"entrypoint": entrypoint_list,
|
326
344
|
"python": python_version, # used to build container
|
327
345
|
"notebook": False, # partial jobs from notebooks not supported
|
328
346
|
}
|
@@ -332,10 +350,17 @@ def _create_repo_metadata(
|
|
332
350
|
|
333
351
|
def _create_artifact_metadata(
|
334
352
|
path: str, entrypoint: str, runtime: Optional[str] = None
|
335
|
-
) -> Tuple[Dict[str, Any], List[str]]:
|
353
|
+
) -> Tuple[Optional[Dict[str, Any]], Optional[List[str]]]:
|
336
354
|
if not os.path.isdir(path):
|
337
355
|
wandb.termerror("Path must be a valid file or directory")
|
338
356
|
return {}, []
|
357
|
+
entrypoint_list = entrypoint.split(" ")
|
358
|
+
entrypoint_file = get_entrypoint_file(entrypoint_list)
|
359
|
+
if not entrypoint_file:
|
360
|
+
wandb.termerror(
|
361
|
+
f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
|
362
|
+
)
|
363
|
+
return None, None
|
339
364
|
|
340
365
|
# read local requirements.txt and dump to temp dir for builder
|
341
366
|
requirements = []
|
@@ -347,41 +372,17 @@ def _create_artifact_metadata(
|
|
347
372
|
if runtime:
|
348
373
|
python_version = _clean_python_version(runtime)
|
349
374
|
else:
|
350
|
-
python_version =
|
375
|
+
python_version, _ = get_current_python_version()
|
376
|
+
python_version = _clean_python_version(python_version)
|
351
377
|
|
352
|
-
metadata = {
|
378
|
+
metadata = {
|
379
|
+
"python": python_version,
|
380
|
+
"codePath": entrypoint_file,
|
381
|
+
"entrypoint": entrypoint_list,
|
382
|
+
}
|
353
383
|
return metadata, requirements
|
354
384
|
|
355
385
|
|
356
|
-
def _handle_artifact_entrypoint(
|
357
|
-
path: str, entrypoint: Optional[str] = None
|
358
|
-
) -> Tuple[str, Optional[str]]:
|
359
|
-
if os.path.isfile(path):
|
360
|
-
if entrypoint and path.endswith(entrypoint):
|
361
|
-
path = path.replace(entrypoint, "")
|
362
|
-
wandb.termwarn(
|
363
|
-
f"Both entrypoint provided and path contains file. Using provided entrypoint: {entrypoint}, path is now: {path}"
|
364
|
-
)
|
365
|
-
elif entrypoint:
|
366
|
-
wandb.termwarn(
|
367
|
-
f"Ignoring passed in entrypoint as it does not match file path found in 'path'. Path entrypoint: {path.split('/')[-1]}"
|
368
|
-
)
|
369
|
-
entrypoint = path.split("/")[-1]
|
370
|
-
path = "/".join(path.split("/")[:-1])
|
371
|
-
elif not entrypoint:
|
372
|
-
wandb.termerror("Entrypoint not valid")
|
373
|
-
return "", None
|
374
|
-
path = path or "." # when path is just an entrypoint, use cdw
|
375
|
-
|
376
|
-
if not os.path.exists(os.path.join(path, entrypoint)):
|
377
|
-
wandb.termerror(
|
378
|
-
f"Could not find execution point: {os.path.join(path, entrypoint)}"
|
379
|
-
)
|
380
|
-
return "", None
|
381
|
-
|
382
|
-
return path, entrypoint
|
383
|
-
|
384
|
-
|
385
386
|
def _configure_job_builder_for_partial(tmpdir: str, job_source: str) -> JobBuilder:
|
386
387
|
"""Configure job builder with temp dir and job source."""
|
387
388
|
# adjust git source to repo
|
@@ -411,7 +412,7 @@ def _make_code_artifact(
|
|
411
412
|
job_builder: JobBuilder,
|
412
413
|
run: "wandb.sdk.wandb_run.Run",
|
413
414
|
path: str,
|
414
|
-
entrypoint:
|
415
|
+
entrypoint: str,
|
415
416
|
entity: Optional[str],
|
416
417
|
project: Optional[str],
|
417
418
|
name: Optional[str],
|
@@ -420,17 +421,22 @@ def _make_code_artifact(
|
|
420
421
|
|
421
422
|
Returns the name of the eventual job.
|
422
423
|
"""
|
423
|
-
|
424
|
+
assert entrypoint is not None
|
425
|
+
entrypoint_list = entrypoint.split(" ")
|
426
|
+
entrypoint_file = get_entrypoint_file(entrypoint_list)
|
427
|
+
if not entrypoint_file:
|
428
|
+
wandb.termerror(
|
429
|
+
f"Entrypoint {entrypoint} is invalid. An entrypoint should include both an executable and a file, for example 'python train.py'"
|
430
|
+
)
|
431
|
+
return None
|
432
|
+
|
433
|
+
artifact_name = _make_code_artifact_name(os.path.join(path, entrypoint_file), name)
|
424
434
|
code_artifact = wandb.Artifact(
|
425
435
|
name=artifact_name,
|
426
436
|
type="code",
|
427
437
|
description="Code artifact for job",
|
428
438
|
)
|
429
439
|
|
430
|
-
# Update path and entrypoint vars to match metadata
|
431
|
-
# TODO(gst): consolidate into one place
|
432
|
-
path, entrypoint = _handle_artifact_entrypoint(path, entrypoint)
|
433
|
-
|
434
440
|
try:
|
435
441
|
code_artifact.add_dir(path)
|
436
442
|
except Exception as e:
|
@@ -441,6 +447,13 @@ def _make_code_artifact(
|
|
441
447
|
wandb.termerror(f"Error adding to code artifact: {e}")
|
442
448
|
return None
|
443
449
|
|
450
|
+
# Remove paths we don't want to include, if present
|
451
|
+
for item in CODE_ARTIFACT_EXCLUDE_PATHS:
|
452
|
+
try:
|
453
|
+
code_artifact.remove(item)
|
454
|
+
except FileNotFoundError:
|
455
|
+
pass
|
456
|
+
|
444
457
|
res, _ = api.create_artifact(
|
445
458
|
artifact_type_name="code",
|
446
459
|
artifact_collection_name=artifact_name,
|
@@ -451,7 +464,7 @@ def _make_code_artifact(
|
|
451
464
|
project_name=project,
|
452
465
|
run_name=run.id, # run will be deleted after creation
|
453
466
|
description="Code artifact for job",
|
454
|
-
metadata={"codePath": path, "entrypoint":
|
467
|
+
metadata={"codePath": path, "entrypoint": entrypoint_file},
|
455
468
|
is_user_created=True,
|
456
469
|
aliases=[
|
457
470
|
{"artifactCollectionName": artifact_name, "alias": a} for a in ["latest"]
|
wandb/sdk/launch/loader.py
CHANGED