wandb 0.17.5__py3-none-any.whl → 0.17.7__py3-none-any.whl
- wandb/__init__.py +5 -16
- wandb/agents/pyagent.py +1 -2
- wandb/apis/public/api.py +1 -1
- wandb/apis/public/jobs.py +5 -0
- wandb/bin/nvidia_gpu_stats +0 -0
- wandb/cli/cli.py +21 -0
- wandb/data_types.py +5 -4
- wandb/env.py +6 -0
- wandb/integration/kfp/wandb_logging.py +1 -1
- wandb/integration/lightning/fabric/logger.py +5 -5
- wandb/integration/openai/fine_tuning.py +13 -5
- wandb/integration/ultralytics/pose_utils.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +226 -226
- wandb/proto/v3/wandb_settings_pb2.py +1 -1
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +226 -226
- wandb/proto/v4/wandb_settings_pb2.py +1 -1
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_internal_pb2.py +226 -226
- wandb/proto/v5/wandb_settings_pb2.py +1 -1
- wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
- wandb/proto/wandb_deprecated.py +4 -0
- wandb/proto/wandb_internal_pb2.py +6 -0
- wandb/sdk/artifacts/artifact.py +6 -1
- wandb/sdk/artifacts/artifact_manifest_entry.py +31 -0
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +35 -23
- wandb/sdk/data_types/_dtypes.py +5 -5
- wandb/sdk/data_types/base_types/media.py +3 -1
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
- wandb/sdk/data_types/helper_types/image_mask.py +3 -1
- wandb/sdk/data_types/image.py +3 -1
- wandb/sdk/data_types/object_3d.py +113 -2
- wandb/sdk/data_types/saved_model.py +3 -1
- wandb/sdk/interface/interface.py +40 -16
- wandb/sdk/interface/interface_shared.py +6 -9
- wandb/sdk/internal/datastore.py +1 -1
- wandb/sdk/internal/handler.py +0 -2
- wandb/sdk/internal/internal.py +1 -1
- wandb/sdk/internal/job_builder.py +5 -2
- wandb/sdk/internal/sender.py +31 -15
- wandb/sdk/internal/tb_watcher.py +2 -2
- wandb/sdk/internal/update.py +2 -2
- wandb/sdk/launch/_launch.py +4 -2
- wandb/sdk/launch/_project_spec.py +34 -8
- wandb/sdk/launch/agent/agent.py +6 -2
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +2 -4
- wandb/sdk/launch/builder/build.py +4 -2
- wandb/sdk/launch/builder/kaniko_builder.py +13 -5
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +2 -1
- wandb/sdk/launch/create_job.py +2 -0
- wandb/sdk/launch/inputs/internal.py +42 -28
- wandb/sdk/launch/inputs/schema.py +39 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +72 -0
- wandb/sdk/launch/runner/local_container.py +13 -10
- wandb/sdk/launch/runner/sagemaker_runner.py +3 -5
- wandb/sdk/launch/utils.py +2 -0
- wandb/sdk/lib/apikey.py +1 -1
- wandb/sdk/lib/disabled.py +13 -174
- wandb/sdk/service/streams.py +2 -4
- wandb/sdk/wandb_config.py +1 -1
- wandb/sdk/wandb_init.py +77 -33
- wandb/sdk/wandb_login.py +6 -6
- wandb/sdk/wandb_run.py +150 -90
- wandb/sdk/wandb_settings.py +4 -3
- wandb/sdk/wandb_setup.py +66 -3
- wandb/sdk/wandb_sweep.py +5 -2
- wandb/wandb_agent.py +2 -0
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/METADATA +3 -2
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/RECORD +72 -70
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/WHEEL +0 -0
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/entry_points.txt +0 -0
- {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/interface/interface.py
CHANGED
@@ -8,11 +8,14 @@ InterfaceRelay: Responses are routed to a relay queue (not matching uuids)
 
 """
 
+import gzip
 import logging
 import os
 import sys
 import time
 from abc import abstractmethod
+from pathlib import Path
+from secrets import token_hex
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -47,20 +50,24 @@ from ..lib.mailbox import MailboxHandle
 from . import summary_record as sr
 from .message_future import MessageFuture
 
+MANIFEST_FILE_SIZE_THRESHOLD = 100_000
+
 GlobStr = NewType("GlobStr", str)
 
-if
-    from
+if sys.version_info >= (3, 8):
+    from typing import Literal, TypedDict
+else:
+    from typing_extensions import Literal, TypedDict
 
-
-    from typing import Literal, TypedDict
-else:
-    from typing_extensions import Literal, TypedDict
+PolicyName = Literal["now", "live", "end"]
 
-PolicyName = Literal["now", "live", "end"]
 
-
-
+class FilesDict(TypedDict):
+    files: Iterable[Tuple[GlobStr, PolicyName]]
+
+
+if TYPE_CHECKING:
+    from ..wandb_run import Run
 
 
 logger = logging.getLogger("wandb")
@@ -107,15 +114,14 @@ class InterfaceBase:
     def _publish_header(self, header: pb.HeaderRecord) -> None:
         raise NotImplementedError
 
-    def
-
-        resp = self._communicate_status(status)
-        return resp
+    def deliver_status(self) -> MailboxHandle:
+        return self._deliver_status(pb.StatusRequest())
 
     @abstractmethod
-    def
-        self,
-
+    def _deliver_status(
+        self,
+        status: pb.StatusRequest,
+    ) -> MailboxHandle:
         raise NotImplementedError
 
     def _make_config(
@@ -334,6 +340,12 @@ class InterfaceBase:
         proto_manifest.version = artifact_manifest.version()
         proto_manifest.storage_policy = artifact_manifest.storage_policy.name()
 
+        # Very large manifests need to be written to file to avoid protobuf size limits.
+        if len(artifact_manifest) > MANIFEST_FILE_SIZE_THRESHOLD:
+            path = self._write_artifact_manifest_file(artifact_manifest)
+            proto_manifest.manifest_file_path = path
+            return proto_manifest
+
         for k, v in artifact_manifest.storage_policy.config().items() or {}.items():
             cfg = proto_manifest.storage_policy_config.add()
             cfg.key = k
@@ -358,6 +370,18 @@ class InterfaceBase:
                 proto_extra.value_json = json.dumps(v)
         return proto_manifest
 
+    def _write_artifact_manifest_file(self, manifest: ArtifactManifest) -> str:
+        manifest_dir = Path(get_staging_dir()) / "artifact_manifests"
+        manifest_dir.mkdir(parents=True, exist_ok=True)
+        # It would be simpler to use `manifest.to_json()`, but that gets very slow for
+        # large manifests since it encodes the whole thing as a single JSON object.
+        filename = f"{time.time()}_{token_hex(8)}.manifest_contents.jl.gz"
+        manifest_file_path = manifest_dir / filename
+        with gzip.open(manifest_file_path, mode="wt", compresslevel=1) as f:
+            for entry in manifest.entries.values():
+                f.write(f"{json.dumps(entry.to_json())}\n")
+        return str(manifest_file_path)
+
     def deliver_link_artifact(
         self,
         run: "Run",
wandb/sdk/interface/interface_shared.py
CHANGED
@@ -299,7 +299,7 @@ class InterfaceShared(InterfaceBase):
         raise NotImplementedError
 
     def _communicate(
-        self, rec: pb.Record, timeout: Optional[int] =
+        self, rec: pb.Record, timeout: Optional[int] = 30, local: Optional[bool] = None
     ) -> Optional[pb.Result]:
         return self._communicate_async(rec, local=local).get(timeout=timeout)
 
@@ -421,15 +421,12 @@
         rec = self._make_record(alert=proto_alert)
         self._publish(rec)
 
-    def
-        self,
-
+    def _deliver_status(
+        self,
+        status: pb.StatusRequest,
+    ) -> MailboxHandle:
         req = self._make_request(status=status)
-
-        if resp is None:
-            return None
-        assert resp.response.status_response
-        return resp.response.status_response
+        return self._deliver_record(req)
 
     def _publish_exit(self, exit_data: pb.RunExitRecord) -> None:
         rec = self._make_record(exit=exit_data)
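The manifest-file path introduced above sidesteps protobuf size limits by spilling large manifests to disk: once a manifest exceeds MANIFEST_FILE_SIZE_THRESHOLD (100,000) entries, each entry is written as one gzipped JSON line instead of being embedded in the record. A minimal sketch of that write pattern, using hypothetical entry dicts in place of `ArtifactManifestEntry.to_json()` output and a temp directory in place of the staging dir:

```python
# Sketch only: stand-in entries and directory; mirrors _write_artifact_manifest_file.
import gzip
import json
import tempfile
import time
from pathlib import Path
from secrets import token_hex

entries = {
    "data/train.csv": {"path": "data/train.csv", "digest": "abc123", "size": 1024},
    "data/val.csv": {"path": "data/val.csv", "digest": "def456", "size": 512},
}

manifest_dir = Path(tempfile.gettempdir()) / "artifact_manifests"  # stand-in for get_staging_dir()
manifest_dir.mkdir(parents=True, exist_ok=True)
manifest_file = manifest_dir / f"{time.time()}_{token_hex(8)}.manifest_contents.jl.gz"

# One JSON object per line keeps memory flat even for very large manifests.
with gzip.open(manifest_file, mode="wt", compresslevel=1) as f:
    for entry in entries.values():
        f.write(f"{json.dumps(entry)}\n")

print(manifest_file)
```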
wandb/sdk/internal/datastore.py
CHANGED
wandb/sdk/internal/handler.py
CHANGED
@@ -745,8 +745,6 @@ class HandleManager:
         self._respond_result(result)
 
     def handle_request_status(self, record: Record) -> None:
-        # TODO(mempressure): do something better?
-        assert record.control.req_resp
         result = proto_util._result_from_record(record)
         self._respond_result(result)
 
wandb/sdk/internal/internal.py
CHANGED
wandb/sdk/internal/job_builder.py
CHANGED
@@ -423,15 +423,18 @@ class JobBuilder:
         api: Api,
         build_context: Optional[str] = None,
         dockerfile: Optional[str] = None,
+        base_image: Optional[str] = None,
     ) -> Optional[Artifact]:
         """Build a job artifact from the current run.
 
         Arguments:
+            api (Api): The API object to use to create the job artifact.
             build_context (Optional[str]): Path within the job source code to
                 the image build context. Saved as part of the job for future
                 builds.
             dockerfile (Optional[str]): Path within the build context the
                 Dockerfile. Saved as part of the job for future builds.
+            base_image (Optional[str]): The base image used to run the job code.
 
         Returns:
             Optional[Artifact]: The job artifact if it was successfully built,
@@ -467,8 +470,6 @@
                 "warn",
             )
             return None
-        metadata["dockerfile"] = dockerfile
-        metadata["build_context"] = build_context
 
         runtime: Optional[str] = metadata.get("python")
         # can't build a job without a python version
@@ -520,6 +521,8 @@
             source["build_context"] = build_context  # type: ignore[typeddict-item]
         if dockerfile:
             source["dockerfile"] = dockerfile  # type: ignore[typeddict-item]
+        if base_image:
+            source["base_image"] = base_image  # type: ignore[typeddict-item]
 
         # Pop any keys that are initialized to None. The current TypedDict
         # system for source dicts requires all keys to be present, but we
wandb/sdk/internal/sender.py
CHANGED
@@ -1,5 +1,7 @@
 """sender."""
 
+import contextlib
+import gzip
 import json
 import logging
 import os
@@ -66,6 +68,7 @@ else:
 if TYPE_CHECKING:
     from wandb.proto.wandb_internal_pb2 import (
         ArtifactManifest,
+        ArtifactManifestEntry,
         ArtifactRecord,
         HttpResponse,
         LocalInfo,
@@ -105,22 +108,18 @@ def _framework_priority() -> Generator[Tuple[str, str], None, None]:
 
 def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
     if manifest.version == 1:
-
-
-
-
-
-
-
-
-
-
-
-                    extra.key: json.loads(extra.value_json) for extra in content.extra
-                },
+        if manifest.manifest_file_path:
+            contents = {}
+            with gzip.open(manifest.manifest_file_path, "rt") as f:
+                for line in f:
+                    entry_json = json.loads(line)
+                    path = entry_json.pop("path")
+                    contents[path] = entry_json
+        else:
+            contents = {
+                content.path: _manifest_entry_from_proto(content)
+                for content in manifest.contents
             }
-            for content in manifest.contents
-        }
     else:
         raise ValueError(f"unknown artifact manifest version: {manifest.version}")
 
@@ -135,6 +134,19 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
     }
 
 
+def _manifest_entry_from_proto(entry: "ArtifactManifestEntry") -> Dict:
+    birth_artifact_id = entry.birth_artifact_id if entry.birth_artifact_id else None
+    return {
+        "digest": entry.digest,
+        "birthArtifactID": birth_artifact_id,
+        "ref": entry.ref if entry.ref else None,
+        "size": entry.size if entry.size is not None else None,
+        "local_path": entry.local_path if entry.local_path else None,
+        "skip_cache": entry.skip_cache,
+        "extra": {extra.key: json.loads(extra.value_json) for extra in entry.extra},
+    }
+
+
 class ResumeState:
     resumed: bool
     step: int
@@ -1586,6 +1598,10 @@
         )
 
         self._job_builder._handle_server_artifact(res, artifact)
+
+        if artifact.manifest.manifest_file_path:
+            with contextlib.suppress(FileNotFoundError):
+                os.remove(artifact.manifest.manifest_file_path)
         return res
 
     def send_alert(self, record: "Record") -> None:
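On the internal-process side, `_manifest_json_from_proto` now rebuilds the `{path: entry}` mapping from that file whenever `manifest_file_path` is set, and `SendManager` deletes the temporary file once the artifact has been saved. A small sketch of the read-back and cleanup, assuming a file written as in the earlier sketch:

```python
# Sketch only: the path below is a placeholder for manifest.manifest_file_path.
import contextlib
import gzip
import json
import os

manifest_file_path = "/tmp/artifact_manifests/example.manifest_contents.jl.gz"

contents = {}
with gzip.open(manifest_file_path, "rt") as f:
    for line in f:
        entry = json.loads(line)
        contents[entry.pop("path")] = entry
print(f"loaded {len(contents)} manifest entries")

# Once the artifact record has been sent, the temp file is no longer needed.
with contextlib.suppress(FileNotFoundError):
    os.remove(manifest_file_path)
```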
wandb/sdk/internal/tb_watcher.py
CHANGED
@@ -123,7 +123,7 @@ class TBWatcher:
         self._force = force
         # TODO(jhr): do we need locking in this queue?
         self._watcher_queue = queue.PriorityQueue()
-        wandb.tensorboard.reset_state()
+        wandb.tensorboard.reset_state()  # type: ignore
 
     def _calculate_namespace(self, logdir: str, rootdir: str) -> Optional[str]:
         namespace: Optional[str]
@@ -430,7 +430,7 @@ class TBEventConsumer:
     def _handle_event(
         self, event: "ProtoEvent", history: Optional["TBHistory"] = None
     ) -> None:
-        wandb.tensorboard._log(
+        wandb.tensorboard._log(  # type: ignore
            event.event,
            step=event.event.step,
            namespace=event.namespace,
wandb/sdk/internal/update.py
CHANGED
@@ -10,7 +10,7 @@ def _find_available(
 ) -> Optional[Tuple[str, bool, bool, bool, Optional[str]]]:
     from wandb.util import parse_version
 
-    pypi_url =
+    pypi_url = "https://pypi.org/pypi/wandb/json"
 
     yanked_dict = {}
     try:
@@ -78,7 +78,7 @@ def check_available(current_version: str) -> Optional[Dict[str, Optional[str]]]:
     if not package_info:
         return None
 
-    wandb_module_name = wandb
+    wandb_module_name = "wandb"
 
     latest_version, pip_prerelease, deleted, yanked, yanked_reason = package_info
     upgrade_message = (
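The update check now queries the fixed wandb endpoint of PyPI's JSON API. A quick illustrative sketch of what that URL returns (requests is assumed to be available; field names follow the public PyPI JSON schema, not wandb internals):

```python
# Sketch only: a direct look at the JSON document the update check parses.
import requests

resp = requests.get("https://pypi.org/pypi/wandb/json", timeout=10)
resp.raise_for_status()
data = resp.json()

latest = data["info"]["version"]
yanked = {
    version: files[0].get("yanked_reason")
    for version, files in data["releases"].items()
    if files and files[0].get("yanked")
}
print(f"latest wandb on PyPI: {latest}; yanked releases: {sorted(yanked)}")
```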
wandb/sdk/launch/_launch.py
CHANGED
@@ -211,7 +211,9 @@ async def _launch(
     launch_project = LaunchProject.from_spec(launch_spec, api)
     launch_project.fetch_and_validate_project()
     entrypoint = launch_project.get_job_entry_point()
-    image_uri =
+    image_uri = (
+        launch_project.docker_image or launch_project.job_base_image
+    )  # Either set by user or None.
 
     # construct runner config.
     runner_config: Dict[str, Any] = {}
@@ -224,7 +226,7 @@
     await environment.verify()
     registry = loader.registry_from_config(registry_config, environment)
     builder = loader.builder_from_config(build_config, environment, registry)
-    if not launch_project.docker_image:
+    if not (launch_project.docker_image or launch_project.job_base_image):
         assert entrypoint
         image_uri = await builder.build_image(launch_project, entrypoint, None)
     backend = loader.runner_from_config(
wandb/sdk/launch/_project_spec.py
CHANGED
@@ -7,6 +7,7 @@ import enum
 import json
 import logging
 import os
+import shutil
 import tempfile
 from copy import deepcopy
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
@@ -112,6 +113,9 @@ class LaunchProject:
         self.sweep_id = sweep_id
         self.author = launch_spec.get("author")
         self.python_version: Optional[str] = launch_spec.get("python_version")
+        self._job_dockerfile: Optional[str] = None
+        self._job_build_context: Optional[str] = None
+        self._job_base_image: Optional[str] = None
         self.accelerator_base_image: Optional[str] = resource_args_build.get(
             "accelerator", {}
         ).get("base_image") or resource_args_build.get("cuda", {}).get("base_image")
@@ -131,8 +135,6 @@
         self._queue_name: Optional[str] = None
         self._queue_entity: Optional[str] = None
         self._run_queue_item_id: Optional[str] = None
-        self._job_dockerfile: Optional[str] = None
-        self._job_build_context: Optional[str] = None
 
     def init_source(self) -> None:
         if self.docker_image is not None:
@@ -146,6 +148,21 @@
             self.project_dir = os.getcwd()
         self._entry_point = self.override_entrypoint
 
+    def change_project_dir(self, new_dir: str) -> None:
+        """Change the project directory to a new directory."""
+        # Copy the contents of the old project dir to the new project dir.
+        old_dir = self.project_dir
+        if old_dir is not None:
+            shutil.copytree(
+                old_dir,
+                new_dir,
+                symlinks=True,
+                dirs_exist_ok=True,
+                ignore=shutil.ignore_patterns("fsmonitor--daemon.ipc", ".git"),
+            )
+            shutil.rmtree(old_dir)
+        self.project_dir = new_dir
+
     def init_git(self, git_info: Dict[str, str]) -> None:
         self.git_version = git_info.get("version")
         self.git_repo = git_info.get("repo")
@@ -212,14 +229,23 @@
     def job_build_context(self) -> Optional[str]:
         return self._job_build_context
 
+    @property
+    def job_base_image(self) -> Optional[str]:
+        return self._job_base_image
+
     def set_job_dockerfile(self, dockerfile: str) -> None:
         self._job_dockerfile = dockerfile
 
     def set_job_build_context(self, build_context: str) -> None:
         self._job_build_context = build_context
 
+    def set_job_base_image(self, base_image: str) -> None:
+        self._job_base_image = base_image
+
     @property
     def image_name(self) -> str:
+        if self.job_base_image is not None:
+            return self.job_base_image
         if self.docker_image is not None:
             return self.docker_image
         elif self.uri is not None:
@@ -299,10 +325,8 @@
 
     def build_required(self) -> bool:
         """Checks the source to see if a build is required."""
-
-
-        # we don't need to build the image for a job if that tag
-        # already exists
+        if self.job_base_image is not None:
+            return False
         if self.source != LaunchSource.JOB:
             return True
         return False
@@ -316,7 +340,9 @@
         Returns:
             Optional[str]: The Docker image or None if not specified.
         """
-
+        if self._docker_image:
+            return self._docker_image
+        return None
 
     @docker_image.setter
     def docker_image(self, value: str) -> None:
@@ -336,7 +362,7 @@
         # assuming project only has 1 entry point, pull that out
         # tmp fn until we figure out if we want to support multiple entry points or not
         if not self._entry_point:
-            if not self.docker_image:
+            if not self.docker_image and not self.job_base_image:
                 raise LaunchError(
                     "Project must have at least one entry point unless docker image is specified."
                 )
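The new `change_project_dir` moves a fetched project by copying it into the new directory (skipping `.git` and git's fsmonitor IPC socket) and then deleting the old tree; the related `job_base_image` additions let a job run on a user-supplied base image, in which case `build_required()` returns False and `image_name` resolves to that image. A standalone sketch of the copy-then-remove pattern, using throwaway temp directories:

```python
# Sketch only: throwaway directories stand in for the launch project dirs.
import shutil
import tempfile
from pathlib import Path

old_dir = Path(tempfile.mkdtemp(prefix="launch_project_old_"))
new_dir = Path(tempfile.mkdtemp(prefix="launch_project_new_"))
(old_dir / "train.py").write_text("print('hello')\n")
(old_dir / ".git").mkdir()  # excluded by the ignore pattern, like the real method

shutil.copytree(
    old_dir,
    new_dir,
    symlinks=True,
    dirs_exist_ok=True,  # new_dir already exists
    ignore=shutil.ignore_patterns("fsmonitor--daemon.ipc", ".git"),
)
shutil.rmtree(old_dir)

print(sorted(p.name for p in new_dir.iterdir()))  # ['train.py']
```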
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -717,7 +717,7 @@ class LaunchAgent:
         _, build_config, registry_config = construct_agent_configs(
             default_config, override_build_config
         )
-        image_uri = project.docker_image
+        image_uri = project.docker_image or project.job_base_image
         entrypoint = project.get_job_entry_point()
         environment = loader.environment_from_config(
             default_config.get("environment", {})
@@ -727,7 +727,11 @@ class LaunchAgent:
         backend = loader.runner_from_config(
             resource, api, backend_config, environment, registry
         )
-        if not (
+        if not (
+            project.docker_image
+            or project.job_base_image
+            or isinstance(backend, LocalProcessRunner)
+        ):
             assert entrypoint is not None
             image_uri = await builder.build_image(project, entrypoint, job_tracker)
 
wandb/sdk/launch/agent/run_queue_item_file_saver.py
CHANGED
@@ -2,7 +2,7 @@
 
 import os
 import sys
-from typing import List, Optional, Union
+from typing import List, Optional
 
 import wandb
 
@@ -17,9 +17,7 @@ FileSubtypes = Literal["warning", "error"]
 class RunQueueItemFileSaver:
     def __init__(
         self,
-        agent_run: Optional[
-            Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
-        ],
+        agent_run: Optional["wandb.sdk.wandb_run.Run"],
         run_queue_item_id: str,
     ):
         self.run_queue_item_id = run_queue_item_id
wandb/sdk/launch/builder/build.py
CHANGED
@@ -201,7 +201,7 @@ def get_requirements_section(
     # If there is a requirements.txt at root of build context, use that.
     if (base_path / "src" / "requirements.txt").exists():
         requirements_files += ["src/requirements.txt"]
-        deps_install_line = "pip install -r requirements.txt"
+        deps_install_line = "pip install uv && uv pip install -r requirements.txt"
         with open(base_path / "src" / "requirements.txt") as f:
             requirements = f.readlines()
         if not any(["wandb" in r for r in requirements]):
@@ -237,7 +237,9 @@
         with open(base_path / "src" / "requirements.txt", "w") as f:
             f.write("\n".join(project_deps))
         requirements_files += ["src/requirements.txt"]
-        deps_install_line =
+        deps_install_line = (
+            "pip install uv && uv pip install -r requirements.txt"
+        )
     return PIP_TEMPLATE.format(
         buildx_optional_prefix=prefix,
         requirements_files=" ".join(requirements_files),
wandb/sdk/launch/builder/kaniko_builder.py
CHANGED
@@ -63,6 +63,13 @@ else:
 NAMESPACE = "wandb"
 
 
+def get_pod_name_safe(job: client.V1Job):
+    try:
+        return job.spec.template.metadata.name
+    except AttributeError:
+        return None
+
+
 async def _wait_for_completion(
     batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
 ) -> bool:
@@ -319,17 +326,18 @@ class KanikoBuilder(AbstractBuilder):
             await self._create_docker_ecr_config_map(
                 build_job_name, core_v1, repo_uri
             )
-            await batch_v1.create_namespaced_job(NAMESPACE, build_job)
-
+            k8s_job = await batch_v1.create_namespaced_job(NAMESPACE, build_job)
             # wait for double the job deadline since it might take time to schedule
             if not await _wait_for_completion(
                 batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
             ):
                 if job_tracker:
                     job_tracker.set_err_stage("build")
-
-
-
+                msg = f"Failed to build image in kaniko for job {run_id}."
+                pod_name = get_pod_name_safe(k8s_job)
+                if pod_name:
+                    msg += f" View logs with `kubectl logs -n {NAMESPACE} {pod_name}`."
+                raise Exception(msg)
             try:
                 pods_from_job = await core_v1.list_namespaced_pod(
                     namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py
CHANGED
@@ -39,12 +39,13 @@ def install_deps(
         deps (str[], None): The dependencies that failed to install
     """
     try:
+        subprocess.check_output(["pip", "install", "uv"], stderr=subprocess.STDOUT)
         # Include only uri if @ is present
         clean_deps = [d.split("@")[-1].strip() if "@" in d else d for d in deps]
         index_args = ["--extra-index-url", extra_index] if extra_index else []
         print("installing {}...".format(", ".join(clean_deps)))
         opts = opts or []
-        args = ["pip", "install"] + opts + clean_deps + index_args
+        args = ["uv", "pip", "install"] + opts + clean_deps + index_args
         sys.stdout.flush()
         subprocess.check_output(args, stderr=subprocess.STDOUT)
         return failed
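Both the Dockerfile template and the container bootstrap script now route dependency installs through uv rather than pip directly: uv is installed first with pip, then `uv pip install` handles resolution. A minimal sketch of that two-step install (package names are placeholders; uv installs into the active virtual environment by default):

```python
# Sketch only: placeholder packages; run inside a virtualenv so uv has a
# target environment.
import subprocess
import sys

deps = ["numpy", "requests"]
extra_index = None  # e.g. a private index URL

subprocess.check_output(["pip", "install", "uv"], stderr=subprocess.STDOUT)

index_args = ["--extra-index-url", extra_index] if extra_index else []
args = ["uv", "pip", "install"] + deps + index_args
sys.stdout.flush()
subprocess.check_output(args, stderr=subprocess.STDOUT)
print("installed:", ", ".join(deps))
```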
wandb/sdk/launch/create_job.py
CHANGED
@@ -114,6 +114,7 @@ def _create_job(
     git_hash: Optional[str] = None,
     build_context: Optional[str] = None,
     dockerfile: Optional[str] = None,
+    base_image: Optional[str] = None,
 ) -> Tuple[Optional[Artifact], str, List[str]]:
     wandb.termlog(f"Creating launch job of type: {job_type}...")
 
@@ -188,6 +189,7 @@
         api.api,
         dockerfile=dockerfile,
         build_context=build_context,
+        base_image=base_image,
     )
     if not artifact:
         wandb.termerror("JobBuilder failed to build a job")