wandb 0.17.5__py3-none-any.whl → 0.17.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +5 -16
- wandb/agents/pyagent.py +1 -2
- wandb/apis/public/api.py +1 -1
- wandb/apis/public/jobs.py +5 -0
- wandb/bin/nvidia_gpu_stats +0 -0
- wandb/cli/cli.py +21 -0
- wandb/data_types.py +5 -4
- wandb/env.py +6 -0
- wandb/integration/kfp/wandb_logging.py +1 -1
- wandb/integration/lightning/fabric/logger.py +5 -5
- wandb/integration/openai/fine_tuning.py +13 -5
- wandb/integration/ultralytics/pose_utils.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +226 -226
- wandb/proto/v3/wandb_settings_pb2.py +1 -1
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +226 -226
- wandb/proto/v4/wandb_settings_pb2.py +1 -1
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_internal_pb2.py +226 -226
- wandb/proto/v5/wandb_settings_pb2.py +1 -1
- wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
- wandb/proto/wandb_deprecated.py +4 -0
- wandb/proto/wandb_internal_pb2.py +6 -0
- wandb/sdk/artifacts/artifact.py +6 -1
- wandb/sdk/artifacts/artifact_manifest_entry.py +31 -0
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +35 -23
- wandb/sdk/data_types/_dtypes.py +5 -5
- wandb/sdk/data_types/base_types/media.py +3 -1
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
- wandb/sdk/data_types/helper_types/image_mask.py +3 -1
- wandb/sdk/data_types/image.py +3 -1
- wandb/sdk/data_types/object_3d.py +113 -2
- wandb/sdk/data_types/saved_model.py +3 -1
- wandb/sdk/interface/interface.py +40 -16
- wandb/sdk/interface/interface_shared.py +6 -9
- wandb/sdk/internal/datastore.py +1 -1
- wandb/sdk/internal/handler.py +0 -2
- wandb/sdk/internal/internal.py +1 -1
- wandb/sdk/internal/job_builder.py +5 -2
- wandb/sdk/internal/sender.py +31 -15
- wandb/sdk/internal/tb_watcher.py +2 -2
- wandb/sdk/internal/update.py +2 -2
- wandb/sdk/launch/_launch.py +4 -2
- wandb/sdk/launch/_project_spec.py +34 -8
- wandb/sdk/launch/agent/agent.py +6 -2
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +2 -4
- wandb/sdk/launch/builder/build.py +4 -2
- wandb/sdk/launch/builder/kaniko_builder.py +13 -5
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +2 -1
- wandb/sdk/launch/create_job.py +2 -0
- wandb/sdk/launch/inputs/internal.py +42 -28
- wandb/sdk/launch/inputs/schema.py +39 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +72 -0
- wandb/sdk/launch/runner/local_container.py +13 -10
- wandb/sdk/launch/runner/sagemaker_runner.py +3 -5
- wandb/sdk/launch/utils.py +2 -0
- wandb/sdk/lib/apikey.py +1 -1
- wandb/sdk/lib/disabled.py +13 -174
- wandb/sdk/service/streams.py +2 -4
- wandb/sdk/wandb_config.py +1 -1
- wandb/sdk/wandb_init.py +77 -33
- wandb/sdk/wandb_login.py +6 -6
- wandb/sdk/wandb_run.py +150 -90
- wandb/sdk/wandb_settings.py +4 -3
- wandb/sdk/wandb_setup.py +66 -3
- wandb/sdk/wandb_sweep.py +5 -2
- wandb/wandb_agent.py +2 -0
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/METADATA +3 -2
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/RECORD +72 -70
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/WHEEL +0 -0
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/entry_points.txt +0 -0
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/interface/interface.py
CHANGED
@@ -8,11 +8,14 @@ InterfaceRelay: Responses are routed to a relay queue (not matching uuids)
|
|
8
8
|
|
9
9
|
"""
|
10
10
|
|
11
|
+
import gzip
|
11
12
|
import logging
|
12
13
|
import os
|
13
14
|
import sys
|
14
15
|
import time
|
15
16
|
from abc import abstractmethod
|
17
|
+
from pathlib import Path
|
18
|
+
from secrets import token_hex
|
16
19
|
from typing import (
|
17
20
|
TYPE_CHECKING,
|
18
21
|
Any,
|
@@ -47,20 +50,24 @@ from ..lib.mailbox import MailboxHandle
|
|
47
50
|
from . import summary_record as sr
|
48
51
|
from .message_future import MessageFuture
|
49
52
|
|
53
|
+
MANIFEST_FILE_SIZE_THRESHOLD = 100_000
|
54
|
+
|
50
55
|
GlobStr = NewType("GlobStr", str)
|
51
56
|
|
52
|
-
if
|
53
|
-
from
|
57
|
+
if sys.version_info >= (3, 8):
|
58
|
+
from typing import Literal, TypedDict
|
59
|
+
else:
|
60
|
+
from typing_extensions import Literal, TypedDict
|
54
61
|
|
55
|
-
|
56
|
-
from typing import Literal, TypedDict
|
57
|
-
else:
|
58
|
-
from typing_extensions import Literal, TypedDict
|
62
|
+
PolicyName = Literal["now", "live", "end"]
|
59
63
|
|
60
|
-
PolicyName = Literal["now", "live", "end"]
|
61
64
|
|
62
|
-
|
63
|
-
|
65
|
+
class FilesDict(TypedDict):
|
66
|
+
files: Iterable[Tuple[GlobStr, PolicyName]]
|
67
|
+
|
68
|
+
|
69
|
+
if TYPE_CHECKING:
|
70
|
+
from ..wandb_run import Run
|
64
71
|
|
65
72
|
|
66
73
|
logger = logging.getLogger("wandb")
|
@@ -107,15 +114,14 @@ class InterfaceBase:
|
|
107
114
|
def _publish_header(self, header: pb.HeaderRecord) -> None:
|
108
115
|
raise NotImplementedError
|
109
116
|
|
110
|
-
def
|
111
|
-
|
112
|
-
resp = self._communicate_status(status)
|
113
|
-
return resp
|
117
|
+
def deliver_status(self) -> MailboxHandle:
|
118
|
+
return self._deliver_status(pb.StatusRequest())
|
114
119
|
|
115
120
|
@abstractmethod
|
116
|
-
def
|
117
|
-
self,
|
118
|
-
|
121
|
+
def _deliver_status(
|
122
|
+
self,
|
123
|
+
status: pb.StatusRequest,
|
124
|
+
) -> MailboxHandle:
|
119
125
|
raise NotImplementedError
|
120
126
|
|
121
127
|
def _make_config(
|
@@ -334,6 +340,12 @@ class InterfaceBase:
|
|
334
340
|
proto_manifest.version = artifact_manifest.version()
|
335
341
|
proto_manifest.storage_policy = artifact_manifest.storage_policy.name()
|
336
342
|
|
343
|
+
# Very large manifests need to be written to file to avoid protobuf size limits.
|
344
|
+
if len(artifact_manifest) > MANIFEST_FILE_SIZE_THRESHOLD:
|
345
|
+
path = self._write_artifact_manifest_file(artifact_manifest)
|
346
|
+
proto_manifest.manifest_file_path = path
|
347
|
+
return proto_manifest
|
348
|
+
|
337
349
|
for k, v in artifact_manifest.storage_policy.config().items() or {}.items():
|
338
350
|
cfg = proto_manifest.storage_policy_config.add()
|
339
351
|
cfg.key = k
|
@@ -358,6 +370,18 @@ class InterfaceBase:
|
|
358
370
|
proto_extra.value_json = json.dumps(v)
|
359
371
|
return proto_manifest
|
360
372
|
|
373
|
+
def _write_artifact_manifest_file(self, manifest: ArtifactManifest) -> str:
|
374
|
+
manifest_dir = Path(get_staging_dir()) / "artifact_manifests"
|
375
|
+
manifest_dir.mkdir(parents=True, exist_ok=True)
|
376
|
+
# It would be simpler to use `manifest.to_json()`, but that gets very slow for
|
377
|
+
# large manifests since it encodes the whole thing as a single JSON object.
|
378
|
+
filename = f"{time.time()}_{token_hex(8)}.manifest_contents.jl.gz"
|
379
|
+
manifest_file_path = manifest_dir / filename
|
380
|
+
with gzip.open(manifest_file_path, mode="wt", compresslevel=1) as f:
|
381
|
+
for entry in manifest.entries.values():
|
382
|
+
f.write(f"{json.dumps(entry.to_json())}\n")
|
383
|
+
return str(manifest_file_path)
|
384
|
+
|
361
385
|
def deliver_link_artifact(
|
362
386
|
self,
|
363
387
|
run: "Run",
|
@@ -299,7 +299,7 @@ class InterfaceShared(InterfaceBase):
|
|
299
299
|
raise NotImplementedError
|
300
300
|
|
301
301
|
def _communicate(
|
302
|
-
self, rec: pb.Record, timeout: Optional[int] =
|
302
|
+
self, rec: pb.Record, timeout: Optional[int] = 30, local: Optional[bool] = None
|
303
303
|
) -> Optional[pb.Result]:
|
304
304
|
return self._communicate_async(rec, local=local).get(timeout=timeout)
|
305
305
|
|
@@ -421,15 +421,12 @@ class InterfaceShared(InterfaceBase):
|
|
421
421
|
rec = self._make_record(alert=proto_alert)
|
422
422
|
self._publish(rec)
|
423
423
|
|
424
|
-
def
|
425
|
-
self,
|
426
|
-
|
424
|
+
def _deliver_status(
|
425
|
+
self,
|
426
|
+
status: pb.StatusRequest,
|
427
|
+
) -> MailboxHandle:
|
427
428
|
req = self._make_request(status=status)
|
428
|
-
|
429
|
-
if resp is None:
|
430
|
-
return None
|
431
|
-
assert resp.response.status_response
|
432
|
-
return resp.response.status_response
|
429
|
+
return self._deliver_record(req)
|
433
430
|
|
434
431
|
def _publish_exit(self, exit_data: pb.RunExitRecord) -> None:
|
435
432
|
rec = self._make_record(exit=exit_data)
|
wandb/sdk/internal/datastore.py
CHANGED
wandb/sdk/internal/handler.py
CHANGED
@@ -745,8 +745,6 @@ class HandleManager:
|
|
745
745
|
self._respond_result(result)
|
746
746
|
|
747
747
|
def handle_request_status(self, record: Record) -> None:
|
748
|
-
# TODO(mempressure): do something better?
|
749
|
-
assert record.control.req_resp
|
750
748
|
result = proto_util._result_from_record(record)
|
751
749
|
self._respond_result(result)
|
752
750
|
|
wandb/sdk/internal/internal.py
CHANGED
@@ -423,15 +423,18 @@ class JobBuilder:
|
|
423
423
|
api: Api,
|
424
424
|
build_context: Optional[str] = None,
|
425
425
|
dockerfile: Optional[str] = None,
|
426
|
+
base_image: Optional[str] = None,
|
426
427
|
) -> Optional[Artifact]:
|
427
428
|
"""Build a job artifact from the current run.
|
428
429
|
|
429
430
|
Arguments:
|
431
|
+
api (Api): The API object to use to create the job artifact.
|
430
432
|
build_context (Optional[str]): Path within the job source code to
|
431
433
|
the image build context. Saved as part of the job for future
|
432
434
|
builds.
|
433
435
|
dockerfile (Optional[str]): Path within the build context the
|
434
436
|
Dockerfile. Saved as part of the job for future builds.
|
437
|
+
base_image (Optional[str]): The base image used to run the job code.
|
435
438
|
|
436
439
|
Returns:
|
437
440
|
Optional[Artifact]: The job artifact if it was successfully built,
|
@@ -467,8 +470,6 @@ class JobBuilder:
|
|
467
470
|
"warn",
|
468
471
|
)
|
469
472
|
return None
|
470
|
-
metadata["dockerfile"] = dockerfile
|
471
|
-
metadata["build_context"] = build_context
|
472
473
|
|
473
474
|
runtime: Optional[str] = metadata.get("python")
|
474
475
|
# can't build a job without a python version
|
@@ -520,6 +521,8 @@ class JobBuilder:
|
|
520
521
|
source["build_context"] = build_context # type: ignore[typeddict-item]
|
521
522
|
if dockerfile:
|
522
523
|
source["dockerfile"] = dockerfile # type: ignore[typeddict-item]
|
524
|
+
if base_image:
|
525
|
+
source["base_image"] = base_image # type: ignore[typeddict-item]
|
523
526
|
|
524
527
|
# Pop any keys that are initialized to None. The current TypedDict
|
525
528
|
# system for source dicts requires all keys to be present, but we
|
wandb/sdk/internal/sender.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
"""sender."""
|
2
2
|
|
3
|
+
import contextlib
|
4
|
+
import gzip
|
3
5
|
import json
|
4
6
|
import logging
|
5
7
|
import os
|
@@ -66,6 +68,7 @@ else:
|
|
66
68
|
if TYPE_CHECKING:
|
67
69
|
from wandb.proto.wandb_internal_pb2 import (
|
68
70
|
ArtifactManifest,
|
71
|
+
ArtifactManifestEntry,
|
69
72
|
ArtifactRecord,
|
70
73
|
HttpResponse,
|
71
74
|
LocalInfo,
|
@@ -105,22 +108,18 @@ def _framework_priority() -> Generator[Tuple[str, str], None, None]:
|
|
105
108
|
|
106
109
|
def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
|
107
110
|
if manifest.version == 1:
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
extra.key: json.loads(extra.value_json) for extra in content.extra
|
120
|
-
},
|
111
|
+
if manifest.manifest_file_path:
|
112
|
+
contents = {}
|
113
|
+
with gzip.open(manifest.manifest_file_path, "rt") as f:
|
114
|
+
for line in f:
|
115
|
+
entry_json = json.loads(line)
|
116
|
+
path = entry_json.pop("path")
|
117
|
+
contents[path] = entry_json
|
118
|
+
else:
|
119
|
+
contents = {
|
120
|
+
content.path: _manifest_entry_from_proto(content)
|
121
|
+
for content in manifest.contents
|
121
122
|
}
|
122
|
-
for content in manifest.contents
|
123
|
-
}
|
124
123
|
else:
|
125
124
|
raise ValueError(f"unknown artifact manifest version: {manifest.version}")
|
126
125
|
|
@@ -135,6 +134,19 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
|
|
135
134
|
}
|
136
135
|
|
137
136
|
|
137
|
+
def _manifest_entry_from_proto(entry: "ArtifactManifestEntry") -> Dict:
|
138
|
+
birth_artifact_id = entry.birth_artifact_id if entry.birth_artifact_id else None
|
139
|
+
return {
|
140
|
+
"digest": entry.digest,
|
141
|
+
"birthArtifactID": birth_artifact_id,
|
142
|
+
"ref": entry.ref if entry.ref else None,
|
143
|
+
"size": entry.size if entry.size is not None else None,
|
144
|
+
"local_path": entry.local_path if entry.local_path else None,
|
145
|
+
"skip_cache": entry.skip_cache,
|
146
|
+
"extra": {extra.key: json.loads(extra.value_json) for extra in entry.extra},
|
147
|
+
}
|
148
|
+
|
149
|
+
|
138
150
|
class ResumeState:
|
139
151
|
resumed: bool
|
140
152
|
step: int
|
@@ -1586,6 +1598,10 @@ class SendManager:
|
|
1586
1598
|
)
|
1587
1599
|
|
1588
1600
|
self._job_builder._handle_server_artifact(res, artifact)
|
1601
|
+
|
1602
|
+
if artifact.manifest.manifest_file_path:
|
1603
|
+
with contextlib.suppress(FileNotFoundError):
|
1604
|
+
os.remove(artifact.manifest.manifest_file_path)
|
1589
1605
|
return res
|
1590
1606
|
|
1591
1607
|
def send_alert(self, record: "Record") -> None:
|
wandb/sdk/internal/tb_watcher.py
CHANGED
@@ -123,7 +123,7 @@ class TBWatcher:
|
|
123
123
|
self._force = force
|
124
124
|
# TODO(jhr): do we need locking in this queue?
|
125
125
|
self._watcher_queue = queue.PriorityQueue()
|
126
|
-
wandb.tensorboard.reset_state()
|
126
|
+
wandb.tensorboard.reset_state() # type: ignore
|
127
127
|
|
128
128
|
def _calculate_namespace(self, logdir: str, rootdir: str) -> Optional[str]:
|
129
129
|
namespace: Optional[str]
|
@@ -430,7 +430,7 @@ class TBEventConsumer:
|
|
430
430
|
def _handle_event(
|
431
431
|
self, event: "ProtoEvent", history: Optional["TBHistory"] = None
|
432
432
|
) -> None:
|
433
|
-
wandb.tensorboard._log(
|
433
|
+
wandb.tensorboard._log( # type: ignore
|
434
434
|
event.event,
|
435
435
|
step=event.event.step,
|
436
436
|
namespace=event.namespace,
|
wandb/sdk/internal/update.py
CHANGED
@@ -10,7 +10,7 @@ def _find_available(
|
|
10
10
|
) -> Optional[Tuple[str, bool, bool, bool, Optional[str]]]:
|
11
11
|
from wandb.util import parse_version
|
12
12
|
|
13
|
-
pypi_url =
|
13
|
+
pypi_url = "https://pypi.org/pypi/wandb/json"
|
14
14
|
|
15
15
|
yanked_dict = {}
|
16
16
|
try:
|
@@ -78,7 +78,7 @@ def check_available(current_version: str) -> Optional[Dict[str, Optional[str]]]:
|
|
78
78
|
if not package_info:
|
79
79
|
return None
|
80
80
|
|
81
|
-
wandb_module_name = wandb
|
81
|
+
wandb_module_name = "wandb"
|
82
82
|
|
83
83
|
latest_version, pip_prerelease, deleted, yanked, yanked_reason = package_info
|
84
84
|
upgrade_message = (
|
wandb/sdk/launch/_launch.py
CHANGED
@@ -211,7 +211,9 @@ async def _launch(
|
|
211
211
|
launch_project = LaunchProject.from_spec(launch_spec, api)
|
212
212
|
launch_project.fetch_and_validate_project()
|
213
213
|
entrypoint = launch_project.get_job_entry_point()
|
214
|
-
image_uri =
|
214
|
+
image_uri = (
|
215
|
+
launch_project.docker_image or launch_project.job_base_image
|
216
|
+
) # Either set by user or None.
|
215
217
|
|
216
218
|
# construct runner config.
|
217
219
|
runner_config: Dict[str, Any] = {}
|
@@ -224,7 +226,7 @@ async def _launch(
|
|
224
226
|
await environment.verify()
|
225
227
|
registry = loader.registry_from_config(registry_config, environment)
|
226
228
|
builder = loader.builder_from_config(build_config, environment, registry)
|
227
|
-
if not launch_project.docker_image:
|
229
|
+
if not (launch_project.docker_image or launch_project.job_base_image):
|
228
230
|
assert entrypoint
|
229
231
|
image_uri = await builder.build_image(launch_project, entrypoint, None)
|
230
232
|
backend = loader.runner_from_config(
|
@@ -7,6 +7,7 @@ import enum
|
|
7
7
|
import json
|
8
8
|
import logging
|
9
9
|
import os
|
10
|
+
import shutil
|
10
11
|
import tempfile
|
11
12
|
from copy import deepcopy
|
12
13
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
|
@@ -112,6 +113,9 @@ class LaunchProject:
|
|
112
113
|
self.sweep_id = sweep_id
|
113
114
|
self.author = launch_spec.get("author")
|
114
115
|
self.python_version: Optional[str] = launch_spec.get("python_version")
|
116
|
+
self._job_dockerfile: Optional[str] = None
|
117
|
+
self._job_build_context: Optional[str] = None
|
118
|
+
self._job_base_image: Optional[str] = None
|
115
119
|
self.accelerator_base_image: Optional[str] = resource_args_build.get(
|
116
120
|
"accelerator", {}
|
117
121
|
).get("base_image") or resource_args_build.get("cuda", {}).get("base_image")
|
@@ -131,8 +135,6 @@ class LaunchProject:
|
|
131
135
|
self._queue_name: Optional[str] = None
|
132
136
|
self._queue_entity: Optional[str] = None
|
133
137
|
self._run_queue_item_id: Optional[str] = None
|
134
|
-
self._job_dockerfile: Optional[str] = None
|
135
|
-
self._job_build_context: Optional[str] = None
|
136
138
|
|
137
139
|
def init_source(self) -> None:
|
138
140
|
if self.docker_image is not None:
|
@@ -146,6 +148,21 @@ class LaunchProject:
|
|
146
148
|
self.project_dir = os.getcwd()
|
147
149
|
self._entry_point = self.override_entrypoint
|
148
150
|
|
151
|
+
def change_project_dir(self, new_dir: str) -> None:
|
152
|
+
"""Change the project directory to a new directory."""
|
153
|
+
# Copy the contents of the old project dir to the new project dir.
|
154
|
+
old_dir = self.project_dir
|
155
|
+
if old_dir is not None:
|
156
|
+
shutil.copytree(
|
157
|
+
old_dir,
|
158
|
+
new_dir,
|
159
|
+
symlinks=True,
|
160
|
+
dirs_exist_ok=True,
|
161
|
+
ignore=shutil.ignore_patterns("fsmonitor--daemon.ipc", ".git"),
|
162
|
+
)
|
163
|
+
shutil.rmtree(old_dir)
|
164
|
+
self.project_dir = new_dir
|
165
|
+
|
149
166
|
def init_git(self, git_info: Dict[str, str]) -> None:
|
150
167
|
self.git_version = git_info.get("version")
|
151
168
|
self.git_repo = git_info.get("repo")
|
@@ -212,14 +229,23 @@ class LaunchProject:
|
|
212
229
|
def job_build_context(self) -> Optional[str]:
|
213
230
|
return self._job_build_context
|
214
231
|
|
232
|
+
@property
|
233
|
+
def job_base_image(self) -> Optional[str]:
|
234
|
+
return self._job_base_image
|
235
|
+
|
215
236
|
def set_job_dockerfile(self, dockerfile: str) -> None:
|
216
237
|
self._job_dockerfile = dockerfile
|
217
238
|
|
218
239
|
def set_job_build_context(self, build_context: str) -> None:
|
219
240
|
self._job_build_context = build_context
|
220
241
|
|
242
|
+
def set_job_base_image(self, base_image: str) -> None:
|
243
|
+
self._job_base_image = base_image
|
244
|
+
|
221
245
|
@property
|
222
246
|
def image_name(self) -> str:
|
247
|
+
if self.job_base_image is not None:
|
248
|
+
return self.job_base_image
|
223
249
|
if self.docker_image is not None:
|
224
250
|
return self.docker_image
|
225
251
|
elif self.uri is not None:
|
@@ -299,10 +325,8 @@ class LaunchProject:
|
|
299
325
|
|
300
326
|
def build_required(self) -> bool:
|
301
327
|
"""Checks the source to see if a build is required."""
|
302
|
-
|
303
|
-
|
304
|
-
# we don't need to build the image for a job if that tag
|
305
|
-
# already exists
|
328
|
+
if self.job_base_image is not None:
|
329
|
+
return False
|
306
330
|
if self.source != LaunchSource.JOB:
|
307
331
|
return True
|
308
332
|
return False
|
@@ -316,7 +340,9 @@ class LaunchProject:
|
|
316
340
|
Returns:
|
317
341
|
Optional[str]: The Docker image or None if not specified.
|
318
342
|
"""
|
319
|
-
|
343
|
+
if self._docker_image:
|
344
|
+
return self._docker_image
|
345
|
+
return None
|
320
346
|
|
321
347
|
@docker_image.setter
|
322
348
|
def docker_image(self, value: str) -> None:
|
@@ -336,7 +362,7 @@ class LaunchProject:
|
|
336
362
|
# assuming project only has 1 entry point, pull that out
|
337
363
|
# tmp fn until we figure out if we want to support multiple entry points or not
|
338
364
|
if not self._entry_point:
|
339
|
-
if not self.docker_image:
|
365
|
+
if not self.docker_image and not self.job_base_image:
|
340
366
|
raise LaunchError(
|
341
367
|
"Project must have at least one entry point unless docker image is specified."
|
342
368
|
)
|
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -717,7 +717,7 @@ class LaunchAgent:
|
|
717
717
|
_, build_config, registry_config = construct_agent_configs(
|
718
718
|
default_config, override_build_config
|
719
719
|
)
|
720
|
-
image_uri = project.docker_image
|
720
|
+
image_uri = project.docker_image or project.job_base_image
|
721
721
|
entrypoint = project.get_job_entry_point()
|
722
722
|
environment = loader.environment_from_config(
|
723
723
|
default_config.get("environment", {})
|
@@ -727,7 +727,11 @@ class LaunchAgent:
|
|
727
727
|
backend = loader.runner_from_config(
|
728
728
|
resource, api, backend_config, environment, registry
|
729
729
|
)
|
730
|
-
if not (
|
730
|
+
if not (
|
731
|
+
project.docker_image
|
732
|
+
or project.job_base_image
|
733
|
+
or isinstance(backend, LocalProcessRunner)
|
734
|
+
):
|
731
735
|
assert entrypoint is not None
|
732
736
|
image_uri = await builder.build_image(project, entrypoint, job_tracker)
|
733
737
|
|
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
import os
|
4
4
|
import sys
|
5
|
-
from typing import List, Optional
|
5
|
+
from typing import List, Optional
|
6
6
|
|
7
7
|
import wandb
|
8
8
|
|
@@ -17,9 +17,7 @@ FileSubtypes = Literal["warning", "error"]
|
|
17
17
|
class RunQueueItemFileSaver:
|
18
18
|
def __init__(
|
19
19
|
self,
|
20
|
-
agent_run: Optional[
|
21
|
-
Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
|
22
|
-
],
|
20
|
+
agent_run: Optional["wandb.sdk.wandb_run.Run"],
|
23
21
|
run_queue_item_id: str,
|
24
22
|
):
|
25
23
|
self.run_queue_item_id = run_queue_item_id
|
@@ -201,7 +201,7 @@ def get_requirements_section(
|
|
201
201
|
# If there is a requirements.txt at root of build context, use that.
|
202
202
|
if (base_path / "src" / "requirements.txt").exists():
|
203
203
|
requirements_files += ["src/requirements.txt"]
|
204
|
-
deps_install_line = "pip install -r requirements.txt"
|
204
|
+
deps_install_line = "pip install uv && uv pip install -r requirements.txt"
|
205
205
|
with open(base_path / "src" / "requirements.txt") as f:
|
206
206
|
requirements = f.readlines()
|
207
207
|
if not any(["wandb" in r for r in requirements]):
|
@@ -237,7 +237,9 @@ def get_requirements_section(
|
|
237
237
|
with open(base_path / "src" / "requirements.txt", "w") as f:
|
238
238
|
f.write("\n".join(project_deps))
|
239
239
|
requirements_files += ["src/requirements.txt"]
|
240
|
-
deps_install_line =
|
240
|
+
deps_install_line = (
|
241
|
+
"pip install uv && uv pip install -r requirements.txt"
|
242
|
+
)
|
241
243
|
return PIP_TEMPLATE.format(
|
242
244
|
buildx_optional_prefix=prefix,
|
243
245
|
requirements_files=" ".join(requirements_files),
|
@@ -63,6 +63,13 @@ else:
|
|
63
63
|
NAMESPACE = "wandb"
|
64
64
|
|
65
65
|
|
66
|
+
def get_pod_name_safe(job: client.V1Job):
|
67
|
+
try:
|
68
|
+
return job.spec.template.metadata.name
|
69
|
+
except AttributeError:
|
70
|
+
return None
|
71
|
+
|
72
|
+
|
66
73
|
async def _wait_for_completion(
|
67
74
|
batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
|
68
75
|
) -> bool:
|
@@ -319,17 +326,18 @@ class KanikoBuilder(AbstractBuilder):
|
|
319
326
|
await self._create_docker_ecr_config_map(
|
320
327
|
build_job_name, core_v1, repo_uri
|
321
328
|
)
|
322
|
-
await batch_v1.create_namespaced_job(NAMESPACE, build_job)
|
323
|
-
|
329
|
+
k8s_job = await batch_v1.create_namespaced_job(NAMESPACE, build_job)
|
324
330
|
# wait for double the job deadline since it might take time to schedule
|
325
331
|
if not await _wait_for_completion(
|
326
332
|
batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
|
327
333
|
):
|
328
334
|
if job_tracker:
|
329
335
|
job_tracker.set_err_stage("build")
|
330
|
-
|
331
|
-
|
332
|
-
|
336
|
+
msg = f"Failed to build image in kaniko for job {run_id}."
|
337
|
+
pod_name = get_pod_name_safe(k8s_job)
|
338
|
+
if pod_name:
|
339
|
+
msg += f" View logs with `kubectl logs -n {NAMESPACE} {pod_name}`."
|
340
|
+
raise Exception(msg)
|
333
341
|
try:
|
334
342
|
pods_from_job = await core_v1.list_namespaced_pod(
|
335
343
|
namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
|
@@ -39,12 +39,13 @@ def install_deps(
|
|
39
39
|
deps (str[], None): The dependencies that failed to install
|
40
40
|
"""
|
41
41
|
try:
|
42
|
+
subprocess.check_output(["pip", "install", "uv"], stderr=subprocess.STDOUT)
|
42
43
|
# Include only uri if @ is present
|
43
44
|
clean_deps = [d.split("@")[-1].strip() if "@" in d else d for d in deps]
|
44
45
|
index_args = ["--extra-index-url", extra_index] if extra_index else []
|
45
46
|
print("installing {}...".format(", ".join(clean_deps)))
|
46
47
|
opts = opts or []
|
47
|
-
args = ["pip", "install"] + opts + clean_deps + index_args
|
48
|
+
args = ["uv", "pip", "install"] + opts + clean_deps + index_args
|
48
49
|
sys.stdout.flush()
|
49
50
|
subprocess.check_output(args, stderr=subprocess.STDOUT)
|
50
51
|
return failed
|
wandb/sdk/launch/create_job.py
CHANGED
@@ -114,6 +114,7 @@ def _create_job(
|
|
114
114
|
git_hash: Optional[str] = None,
|
115
115
|
build_context: Optional[str] = None,
|
116
116
|
dockerfile: Optional[str] = None,
|
117
|
+
base_image: Optional[str] = None,
|
117
118
|
) -> Tuple[Optional[Artifact], str, List[str]]:
|
118
119
|
wandb.termlog(f"Creating launch job of type: {job_type}...")
|
119
120
|
|
@@ -188,6 +189,7 @@ def _create_job(
|
|
188
189
|
api.api,
|
189
190
|
dockerfile=dockerfile,
|
190
191
|
build_context=build_context,
|
192
|
+
base_image=base_image,
|
191
193
|
)
|
192
194
|
if not artifact:
|
193
195
|
wandb.termerror("JobBuilder failed to build a job")
|