wandb 0.17.5__py3-none-any.whl → 0.17.7__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. wandb/__init__.py +5 -16
  2. wandb/agents/pyagent.py +1 -2
  3. wandb/apis/public/api.py +1 -1
  4. wandb/apis/public/jobs.py +5 -0
  5. wandb/bin/nvidia_gpu_stats +0 -0
  6. wandb/cli/cli.py +21 -0
  7. wandb/data_types.py +5 -4
  8. wandb/env.py +6 -0
  9. wandb/integration/kfp/wandb_logging.py +1 -1
  10. wandb/integration/lightning/fabric/logger.py +5 -5
  11. wandb/integration/openai/fine_tuning.py +13 -5
  12. wandb/integration/ultralytics/pose_utils.py +0 -1
  13. wandb/proto/v3/wandb_internal_pb2.py +226 -226
  14. wandb/proto/v3/wandb_settings_pb2.py +1 -1
  15. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  16. wandb/proto/v4/wandb_internal_pb2.py +226 -226
  17. wandb/proto/v4/wandb_settings_pb2.py +1 -1
  18. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  19. wandb/proto/v5/wandb_internal_pb2.py +226 -226
  20. wandb/proto/v5/wandb_settings_pb2.py +1 -1
  21. wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
  22. wandb/proto/wandb_deprecated.py +4 -0
  23. wandb/proto/wandb_internal_pb2.py +6 -0
  24. wandb/sdk/artifacts/artifact.py +6 -1
  25. wandb/sdk/artifacts/artifact_manifest_entry.py +31 -0
  26. wandb/sdk/artifacts/storage_handlers/azure_handler.py +35 -23
  27. wandb/sdk/data_types/_dtypes.py +5 -5
  28. wandb/sdk/data_types/base_types/media.py +3 -1
  29. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
  30. wandb/sdk/data_types/helper_types/image_mask.py +3 -1
  31. wandb/sdk/data_types/image.py +3 -1
  32. wandb/sdk/data_types/object_3d.py +113 -2
  33. wandb/sdk/data_types/saved_model.py +3 -1
  34. wandb/sdk/interface/interface.py +40 -16
  35. wandb/sdk/interface/interface_shared.py +6 -9
  36. wandb/sdk/internal/datastore.py +1 -1
  37. wandb/sdk/internal/handler.py +0 -2
  38. wandb/sdk/internal/internal.py +1 -1
  39. wandb/sdk/internal/job_builder.py +5 -2
  40. wandb/sdk/internal/sender.py +31 -15
  41. wandb/sdk/internal/tb_watcher.py +2 -2
  42. wandb/sdk/internal/update.py +2 -2
  43. wandb/sdk/launch/_launch.py +4 -2
  44. wandb/sdk/launch/_project_spec.py +34 -8
  45. wandb/sdk/launch/agent/agent.py +6 -2
  46. wandb/sdk/launch/agent/run_queue_item_file_saver.py +2 -4
  47. wandb/sdk/launch/builder/build.py +4 -2
  48. wandb/sdk/launch/builder/kaniko_builder.py +13 -5
  49. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +2 -1
  50. wandb/sdk/launch/create_job.py +2 -0
  51. wandb/sdk/launch/inputs/internal.py +42 -28
  52. wandb/sdk/launch/inputs/schema.py +39 -0
  53. wandb/sdk/launch/runner/kubernetes_runner.py +72 -0
  54. wandb/sdk/launch/runner/local_container.py +13 -10
  55. wandb/sdk/launch/runner/sagemaker_runner.py +3 -5
  56. wandb/sdk/launch/utils.py +2 -0
  57. wandb/sdk/lib/apikey.py +1 -1
  58. wandb/sdk/lib/disabled.py +13 -174
  59. wandb/sdk/service/streams.py +2 -4
  60. wandb/sdk/wandb_config.py +1 -1
  61. wandb/sdk/wandb_init.py +77 -33
  62. wandb/sdk/wandb_login.py +6 -6
  63. wandb/sdk/wandb_run.py +150 -90
  64. wandb/sdk/wandb_settings.py +4 -3
  65. wandb/sdk/wandb_setup.py +66 -3
  66. wandb/sdk/wandb_sweep.py +5 -2
  67. wandb/wandb_agent.py +2 -0
  68. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/METADATA +3 -2
  69. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/RECORD +72 -70
  70. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/WHEEL +0 -0
  71. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/entry_points.txt +0 -0
  72. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/licenses/LICENSE +0 -0
@@ -8,11 +8,14 @@ InterfaceRelay: Responses are routed to a relay queue (not matching uuids)
8
8
 
9
9
  """
10
10
 
11
+ import gzip
11
12
  import logging
12
13
  import os
13
14
  import sys
14
15
  import time
15
16
  from abc import abstractmethod
17
+ from pathlib import Path
18
+ from secrets import token_hex
16
19
  from typing import (
17
20
  TYPE_CHECKING,
18
21
  Any,
@@ -47,20 +50,24 @@ from ..lib.mailbox import MailboxHandle
47
50
  from . import summary_record as sr
48
51
  from .message_future import MessageFuture
49
52
 
53
+ MANIFEST_FILE_SIZE_THRESHOLD = 100_000
54
+
50
55
  GlobStr = NewType("GlobStr", str)
51
56
 
52
- if TYPE_CHECKING:
53
- from ..wandb_run import Run
57
+ if sys.version_info >= (3, 8):
58
+ from typing import Literal, TypedDict
59
+ else:
60
+ from typing_extensions import Literal, TypedDict
54
61
 
55
- if sys.version_info >= (3, 8):
56
- from typing import Literal, TypedDict
57
- else:
58
- from typing_extensions import Literal, TypedDict
62
+ PolicyName = Literal["now", "live", "end"]
59
63
 
60
- PolicyName = Literal["now", "live", "end"]
61
64
 
62
- class FilesDict(TypedDict):
63
- files: Iterable[Tuple[GlobStr, PolicyName]]
65
+ class FilesDict(TypedDict):
66
+ files: Iterable[Tuple[GlobStr, PolicyName]]
67
+
68
+
69
+ if TYPE_CHECKING:
70
+ from ..wandb_run import Run
64
71
 
65
72
 
66
73
  logger = logging.getLogger("wandb")
@@ -107,15 +114,14 @@ class InterfaceBase:
107
114
  def _publish_header(self, header: pb.HeaderRecord) -> None:
108
115
  raise NotImplementedError
109
116
 
110
- def communicate_status(self) -> Optional[pb.StatusResponse]:
111
- status = pb.StatusRequest()
112
- resp = self._communicate_status(status)
113
- return resp
117
+ def deliver_status(self) -> MailboxHandle:
118
+ return self._deliver_status(pb.StatusRequest())
114
119
 
115
120
  @abstractmethod
116
- def _communicate_status(
117
- self, status: pb.StatusRequest
118
- ) -> Optional[pb.StatusResponse]:
121
+ def _deliver_status(
122
+ self,
123
+ status: pb.StatusRequest,
124
+ ) -> MailboxHandle:
119
125
  raise NotImplementedError
120
126
 
121
127
  def _make_config(
@@ -334,6 +340,12 @@ class InterfaceBase:
334
340
  proto_manifest.version = artifact_manifest.version()
335
341
  proto_manifest.storage_policy = artifact_manifest.storage_policy.name()
336
342
 
343
+ # Very large manifests need to be written to file to avoid protobuf size limits.
344
+ if len(artifact_manifest) > MANIFEST_FILE_SIZE_THRESHOLD:
345
+ path = self._write_artifact_manifest_file(artifact_manifest)
346
+ proto_manifest.manifest_file_path = path
347
+ return proto_manifest
348
+
337
349
  for k, v in artifact_manifest.storage_policy.config().items() or {}.items():
338
350
  cfg = proto_manifest.storage_policy_config.add()
339
351
  cfg.key = k
@@ -358,6 +370,18 @@ class InterfaceBase:
358
370
  proto_extra.value_json = json.dumps(v)
359
371
  return proto_manifest
360
372
 
373
+ def _write_artifact_manifest_file(self, manifest: ArtifactManifest) -> str:
374
+ manifest_dir = Path(get_staging_dir()) / "artifact_manifests"
375
+ manifest_dir.mkdir(parents=True, exist_ok=True)
376
+ # It would be simpler to use `manifest.to_json()`, but that gets very slow for
377
+ # large manifests since it encodes the whole thing as a single JSON object.
378
+ filename = f"{time.time()}_{token_hex(8)}.manifest_contents.jl.gz"
379
+ manifest_file_path = manifest_dir / filename
380
+ with gzip.open(manifest_file_path, mode="wt", compresslevel=1) as f:
381
+ for entry in manifest.entries.values():
382
+ f.write(f"{json.dumps(entry.to_json())}\n")
383
+ return str(manifest_file_path)
384
+
361
385
  def deliver_link_artifact(
362
386
  self,
363
387
  run: "Run",
@@ -299,7 +299,7 @@ class InterfaceShared(InterfaceBase):
299
299
  raise NotImplementedError
300
300
 
301
301
  def _communicate(
302
- self, rec: pb.Record, timeout: Optional[int] = 5, local: Optional[bool] = None
302
+ self, rec: pb.Record, timeout: Optional[int] = 30, local: Optional[bool] = None
303
303
  ) -> Optional[pb.Result]:
304
304
  return self._communicate_async(rec, local=local).get(timeout=timeout)
305
305
 
@@ -421,15 +421,12 @@ class InterfaceShared(InterfaceBase):
421
421
  rec = self._make_record(alert=proto_alert)
422
422
  self._publish(rec)
423
423
 
424
- def _communicate_status(
425
- self, status: pb.StatusRequest
426
- ) -> Optional[pb.StatusResponse]:
424
+ def _deliver_status(
425
+ self,
426
+ status: pb.StatusRequest,
427
+ ) -> MailboxHandle:
427
428
  req = self._make_request(status=status)
428
- resp = self._communicate(req, local=True)
429
- if resp is None:
430
- return None
431
- assert resp.response.status_response
432
- return resp.response.status_response
429
+ return self._deliver_record(req)
433
430
 
434
431
  def _publish_exit(self, exit_data: pb.RunExitRecord) -> None:
435
432
  rec = self._make_record(exit=exit_data)
@@ -69,7 +69,7 @@ class DataStore:
69
69
 
70
70
  def __init__(self) -> None:
71
71
  self._opened_for_scan = False
72
- self._fp: Optional["IO[Any]"] = None
72
+ self._fp: Optional[IO[Any]] = None
73
73
  self._index = 0
74
74
  self._flush_offset = 0
75
75
  self._size_bytes = 0
@@ -745,8 +745,6 @@ class HandleManager:
745
745
  self._respond_result(result)
746
746
 
747
747
  def handle_request_status(self, record: Record) -> None:
748
- # TODO(mempressure): do something better?
749
- assert record.control.req_resp
750
748
  result = proto_util._result_from_record(record)
751
749
  self._respond_result(result)
752
750
 
@@ -62,7 +62,7 @@ def wandb_internal(
62
62
 
63
63
  """
64
64
  # mark this process as internal
65
- wandb._set_internal_process()
65
+ wandb._set_internal_process() # type: ignore
66
66
  _setup_tracelog()
67
67
  started = time.time()
68
68
 
@@ -423,15 +423,18 @@ class JobBuilder:
423
423
  api: Api,
424
424
  build_context: Optional[str] = None,
425
425
  dockerfile: Optional[str] = None,
426
+ base_image: Optional[str] = None,
426
427
  ) -> Optional[Artifact]:
427
428
  """Build a job artifact from the current run.
428
429
 
429
430
  Arguments:
431
+ api (Api): The API object to use to create the job artifact.
430
432
  build_context (Optional[str]): Path within the job source code to
431
433
  the image build context. Saved as part of the job for future
432
434
  builds.
433
435
  dockerfile (Optional[str]): Path within the build context to the
434
436
  Dockerfile. Saved as part of the job for future builds.
437
+ base_image (Optional[str]): The base image used to run the job code.
435
438
 
436
439
  Returns:
437
440
  Optional[Artifact]: The job artifact if it was successfully built,
@@ -467,8 +470,6 @@ class JobBuilder:
467
470
  "warn",
468
471
  )
469
472
  return None
470
- metadata["dockerfile"] = dockerfile
471
- metadata["build_context"] = build_context
472
473
 
473
474
  runtime: Optional[str] = metadata.get("python")
474
475
  # can't build a job without a python version
@@ -520,6 +521,8 @@ class JobBuilder:
520
521
  source["build_context"] = build_context # type: ignore[typeddict-item]
521
522
  if dockerfile:
522
523
  source["dockerfile"] = dockerfile # type: ignore[typeddict-item]
524
+ if base_image:
525
+ source["base_image"] = base_image # type: ignore[typeddict-item]
523
526
 
524
527
  # Pop any keys that are initialized to None. The current TypedDict
525
528
  # system for source dicts requires all keys to be present, but we
@@ -1,5 +1,7 @@
1
1
  """sender."""
2
2
 
3
+ import contextlib
4
+ import gzip
3
5
  import json
4
6
  import logging
5
7
  import os
@@ -66,6 +68,7 @@ else:
66
68
  if TYPE_CHECKING:
67
69
  from wandb.proto.wandb_internal_pb2 import (
68
70
  ArtifactManifest,
71
+ ArtifactManifestEntry,
69
72
  ArtifactRecord,
70
73
  HttpResponse,
71
74
  LocalInfo,
@@ -105,22 +108,18 @@ def _framework_priority() -> Generator[Tuple[str, str], None, None]:
105
108
 
106
109
  def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
107
110
  if manifest.version == 1:
108
- contents = {
109
- content.path: {
110
- "digest": content.digest,
111
- "birthArtifactID": content.birth_artifact_id
112
- if content.birth_artifact_id
113
- else None,
114
- "ref": content.ref if content.ref else None,
115
- "size": content.size if content.size is not None else None,
116
- "local_path": content.local_path if content.local_path else None,
117
- "skip_cache": content.skip_cache,
118
- "extra": {
119
- extra.key: json.loads(extra.value_json) for extra in content.extra
120
- },
111
+ if manifest.manifest_file_path:
112
+ contents = {}
113
+ with gzip.open(manifest.manifest_file_path, "rt") as f:
114
+ for line in f:
115
+ entry_json = json.loads(line)
116
+ path = entry_json.pop("path")
117
+ contents[path] = entry_json
118
+ else:
119
+ contents = {
120
+ content.path: _manifest_entry_from_proto(content)
121
+ for content in manifest.contents
121
122
  }
122
- for content in manifest.contents
123
- }
124
123
  else:
125
124
  raise ValueError(f"unknown artifact manifest version: {manifest.version}")
126
125
 
@@ -135,6 +134,19 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
135
134
  }
136
135
 
137
136
 
137
+ def _manifest_entry_from_proto(entry: "ArtifactManifestEntry") -> Dict:
138
+ birth_artifact_id = entry.birth_artifact_id if entry.birth_artifact_id else None
139
+ return {
140
+ "digest": entry.digest,
141
+ "birthArtifactID": birth_artifact_id,
142
+ "ref": entry.ref if entry.ref else None,
143
+ "size": entry.size if entry.size is not None else None,
144
+ "local_path": entry.local_path if entry.local_path else None,
145
+ "skip_cache": entry.skip_cache,
146
+ "extra": {extra.key: json.loads(extra.value_json) for extra in entry.extra},
147
+ }
148
+
149
+
138
150
  class ResumeState:
139
151
  resumed: bool
140
152
  step: int
@@ -1586,6 +1598,10 @@ class SendManager:
1586
1598
  )
1587
1599
 
1588
1600
  self._job_builder._handle_server_artifact(res, artifact)
1601
+
1602
+ if artifact.manifest.manifest_file_path:
1603
+ with contextlib.suppress(FileNotFoundError):
1604
+ os.remove(artifact.manifest.manifest_file_path)
1589
1605
  return res
1590
1606
 
1591
1607
  def send_alert(self, record: "Record") -> None:
@@ -123,7 +123,7 @@ class TBWatcher:
123
123
  self._force = force
124
124
  # TODO(jhr): do we need locking in this queue?
125
125
  self._watcher_queue = queue.PriorityQueue()
126
- wandb.tensorboard.reset_state()
126
+ wandb.tensorboard.reset_state() # type: ignore
127
127
 
128
128
  def _calculate_namespace(self, logdir: str, rootdir: str) -> Optional[str]:
129
129
  namespace: Optional[str]
@@ -430,7 +430,7 @@ class TBEventConsumer:
430
430
  def _handle_event(
431
431
  self, event: "ProtoEvent", history: Optional["TBHistory"] = None
432
432
  ) -> None:
433
- wandb.tensorboard._log(
433
+ wandb.tensorboard._log( # type: ignore
434
434
  event.event,
435
435
  step=event.event.step,
436
436
  namespace=event.namespace,
@@ -10,7 +10,7 @@ def _find_available(
10
10
  ) -> Optional[Tuple[str, bool, bool, bool, Optional[str]]]:
11
11
  from wandb.util import parse_version
12
12
 
13
- pypi_url = f"https://pypi.org/pypi/{wandb._wandb_module}/json"
13
+ pypi_url = "https://pypi.org/pypi/wandb/json"
14
14
 
15
15
  yanked_dict = {}
16
16
  try:
@@ -78,7 +78,7 @@ def check_available(current_version: str) -> Optional[Dict[str, Optional[str]]]:
78
78
  if not package_info:
79
79
  return None
80
80
 
81
- wandb_module_name = wandb._wandb_module
81
+ wandb_module_name = "wandb"
82
82
 
83
83
  latest_version, pip_prerelease, deleted, yanked, yanked_reason = package_info
84
84
  upgrade_message = (
@@ -211,7 +211,9 @@ async def _launch(
211
211
  launch_project = LaunchProject.from_spec(launch_spec, api)
212
212
  launch_project.fetch_and_validate_project()
213
213
  entrypoint = launch_project.get_job_entry_point()
214
- image_uri = launch_project.docker_image # Either set by user or None.
214
+ image_uri = (
215
+ launch_project.docker_image or launch_project.job_base_image
216
+ ) # User-set image, falling back to the job's base image; None if neither.
215
217
 
216
218
  # construct runner config.
217
219
  runner_config: Dict[str, Any] = {}
@@ -224,7 +226,7 @@ async def _launch(
224
226
  await environment.verify()
225
227
  registry = loader.registry_from_config(registry_config, environment)
226
228
  builder = loader.builder_from_config(build_config, environment, registry)
227
- if not launch_project.docker_image:
229
+ if not (launch_project.docker_image or launch_project.job_base_image):
228
230
  assert entrypoint
229
231
  image_uri = await builder.build_image(launch_project, entrypoint, None)
230
232
  backend = loader.runner_from_config(
@@ -7,6 +7,7 @@ import enum
7
7
  import json
8
8
  import logging
9
9
  import os
10
+ import shutil
10
11
  import tempfile
11
12
  from copy import deepcopy
12
13
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
@@ -112,6 +113,9 @@ class LaunchProject:
112
113
  self.sweep_id = sweep_id
113
114
  self.author = launch_spec.get("author")
114
115
  self.python_version: Optional[str] = launch_spec.get("python_version")
116
+ self._job_dockerfile: Optional[str] = None
117
+ self._job_build_context: Optional[str] = None
118
+ self._job_base_image: Optional[str] = None
115
119
  self.accelerator_base_image: Optional[str] = resource_args_build.get(
116
120
  "accelerator", {}
117
121
  ).get("base_image") or resource_args_build.get("cuda", {}).get("base_image")
@@ -131,8 +135,6 @@ class LaunchProject:
131
135
  self._queue_name: Optional[str] = None
132
136
  self._queue_entity: Optional[str] = None
133
137
  self._run_queue_item_id: Optional[str] = None
134
- self._job_dockerfile: Optional[str] = None
135
- self._job_build_context: Optional[str] = None
136
138
 
137
139
  def init_source(self) -> None:
138
140
  if self.docker_image is not None:
@@ -146,6 +148,21 @@ class LaunchProject:
146
148
  self.project_dir = os.getcwd()
147
149
  self._entry_point = self.override_entrypoint
148
150
 
151
+ def change_project_dir(self, new_dir: str) -> None:
152
+ """Change the project directory to a new directory."""
153
+ # Copy the contents of the old project dir to the new project dir.
154
+ old_dir = self.project_dir
155
+ if old_dir is not None:
156
+ shutil.copytree(
157
+ old_dir,
158
+ new_dir,
159
+ symlinks=True,
160
+ dirs_exist_ok=True,
161
+ ignore=shutil.ignore_patterns("fsmonitor--daemon.ipc", ".git"),
162
+ )
163
+ shutil.rmtree(old_dir)
164
+ self.project_dir = new_dir
165
+
149
166
  def init_git(self, git_info: Dict[str, str]) -> None:
150
167
  self.git_version = git_info.get("version")
151
168
  self.git_repo = git_info.get("repo")
@@ -212,14 +229,23 @@ class LaunchProject:
212
229
  def job_build_context(self) -> Optional[str]:
213
230
  return self._job_build_context
214
231
 
232
+ @property
233
+ def job_base_image(self) -> Optional[str]:
234
+ return self._job_base_image
235
+
215
236
  def set_job_dockerfile(self, dockerfile: str) -> None:
216
237
  self._job_dockerfile = dockerfile
217
238
 
218
239
  def set_job_build_context(self, build_context: str) -> None:
219
240
  self._job_build_context = build_context
220
241
 
242
+ def set_job_base_image(self, base_image: str) -> None:
243
+ self._job_base_image = base_image
244
+
221
245
  @property
222
246
  def image_name(self) -> str:
247
+ if self.job_base_image is not None:
248
+ return self.job_base_image
223
249
  if self.docker_image is not None:
224
250
  return self.docker_image
225
251
  elif self.uri is not None:
@@ -299,10 +325,8 @@ class LaunchProject:
299
325
 
300
326
  def build_required(self) -> bool:
301
327
  """Checks the source to see if a build is required."""
302
- # since the image tag for images built from jobs
303
- # is based on the job version index, which is immutable
304
- # we don't need to build the image for a job if that tag
305
- # already exists
328
+ if self.job_base_image is not None:
329
+ return False
306
330
  if self.source != LaunchSource.JOB:
307
331
  return True
308
332
  return False
@@ -316,7 +340,9 @@ class LaunchProject:
316
340
  Returns:
317
341
  Optional[str]: The Docker image or None if not specified.
318
342
  """
319
- return self._docker_image
343
+ if self._docker_image:
344
+ return self._docker_image
345
+ return None
320
346
 
321
347
  @docker_image.setter
322
348
  def docker_image(self, value: str) -> None:
@@ -336,7 +362,7 @@ class LaunchProject:
336
362
  # assuming project only has 1 entry point, pull that out
337
363
  # tmp fn until we figure out if we want to support multiple entry points or not
338
364
  if not self._entry_point:
339
- if not self.docker_image:
365
+ if not self.docker_image and not self.job_base_image:
340
366
  raise LaunchError(
341
367
  "Project must have at least one entry point unless docker image is specified."
342
368
  )
@@ -717,7 +717,7 @@ class LaunchAgent:
717
717
  _, build_config, registry_config = construct_agent_configs(
718
718
  default_config, override_build_config
719
719
  )
720
- image_uri = project.docker_image
720
+ image_uri = project.docker_image or project.job_base_image
721
721
  entrypoint = project.get_job_entry_point()
722
722
  environment = loader.environment_from_config(
723
723
  default_config.get("environment", {})
@@ -727,7 +727,11 @@ class LaunchAgent:
727
727
  backend = loader.runner_from_config(
728
728
  resource, api, backend_config, environment, registry
729
729
  )
730
- if not (project.docker_image or isinstance(backend, LocalProcessRunner)):
730
+ if not (
731
+ project.docker_image
732
+ or project.job_base_image
733
+ or isinstance(backend, LocalProcessRunner)
734
+ ):
731
735
  assert entrypoint is not None
732
736
  image_uri = await builder.build_image(project, entrypoint, job_tracker)
733
737
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  import os
4
4
  import sys
5
- from typing import List, Optional, Union
5
+ from typing import List, Optional
6
6
 
7
7
  import wandb
8
8
 
@@ -17,9 +17,7 @@ FileSubtypes = Literal["warning", "error"]
17
17
  class RunQueueItemFileSaver:
18
18
  def __init__(
19
19
  self,
20
- agent_run: Optional[
21
- Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
22
- ],
20
+ agent_run: Optional["wandb.sdk.wandb_run.Run"],
23
21
  run_queue_item_id: str,
24
22
  ):
25
23
  self.run_queue_item_id = run_queue_item_id
@@ -201,7 +201,7 @@ def get_requirements_section(
201
201
  # If there is a requirements.txt at root of build context, use that.
202
202
  if (base_path / "src" / "requirements.txt").exists():
203
203
  requirements_files += ["src/requirements.txt"]
204
- deps_install_line = "pip install -r requirements.txt"
204
+ deps_install_line = "pip install uv && uv pip install -r requirements.txt"
205
205
  with open(base_path / "src" / "requirements.txt") as f:
206
206
  requirements = f.readlines()
207
207
  if not any(["wandb" in r for r in requirements]):
@@ -237,7 +237,9 @@ def get_requirements_section(
237
237
  with open(base_path / "src" / "requirements.txt", "w") as f:
238
238
  f.write("\n".join(project_deps))
239
239
  requirements_files += ["src/requirements.txt"]
240
- deps_install_line = "pip install -r requirements.txt"
240
+ deps_install_line = (
241
+ "pip install uv && uv pip install -r requirements.txt"
242
+ )
241
243
  return PIP_TEMPLATE.format(
242
244
  buildx_optional_prefix=prefix,
243
245
  requirements_files=" ".join(requirements_files),
@@ -63,6 +63,13 @@ else:
63
63
  NAMESPACE = "wandb"
64
64
 
65
65
 
66
+ def get_pod_name_safe(job: client.V1Job):
67
+ try:
68
+ return job.spec.template.metadata.name
69
+ except AttributeError:
70
+ return None
71
+
72
+
66
73
  async def _wait_for_completion(
67
74
  batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
68
75
  ) -> bool:
@@ -319,17 +326,18 @@ class KanikoBuilder(AbstractBuilder):
319
326
  await self._create_docker_ecr_config_map(
320
327
  build_job_name, core_v1, repo_uri
321
328
  )
322
- await batch_v1.create_namespaced_job(NAMESPACE, build_job)
323
-
329
+ k8s_job = await batch_v1.create_namespaced_job(NAMESPACE, build_job)
324
330
  # wait for 3x the default build timeout since the job might take time to schedule
325
331
  if not await _wait_for_completion(
326
332
  batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
327
333
  ):
328
334
  if job_tracker:
329
335
  job_tracker.set_err_stage("build")
330
- raise Exception(
331
- f"Failed to build image in kaniko for job {run_id}. View logs with `kubectl logs -n {NAMESPACE} {build_job_name}`."
332
- )
336
+ msg = f"Failed to build image in kaniko for job {run_id}."
337
+ pod_name = get_pod_name_safe(k8s_job)
338
+ if pod_name:
339
+ msg += f" View logs with `kubectl logs -n {NAMESPACE} {pod_name}`."
340
+ raise Exception(msg)
333
341
  try:
334
342
  pods_from_job = await core_v1.list_namespaced_pod(
335
343
  namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
@@ -39,12 +39,13 @@ def install_deps(
39
39
  deps (str[], None): The dependencies that failed to install
40
40
  """
41
41
  try:
42
+ subprocess.check_output(["pip", "install", "uv"], stderr=subprocess.STDOUT)
42
43
  # Include only uri if @ is present
43
44
  clean_deps = [d.split("@")[-1].strip() if "@" in d else d for d in deps]
44
45
  index_args = ["--extra-index-url", extra_index] if extra_index else []
45
46
  print("installing {}...".format(", ".join(clean_deps)))
46
47
  opts = opts or []
47
- args = ["pip", "install"] + opts + clean_deps + index_args
48
+ args = ["uv", "pip", "install"] + opts + clean_deps + index_args
48
49
  sys.stdout.flush()
49
50
  subprocess.check_output(args, stderr=subprocess.STDOUT)
50
51
  return failed
@@ -114,6 +114,7 @@ def _create_job(
114
114
  git_hash: Optional[str] = None,
115
115
  build_context: Optional[str] = None,
116
116
  dockerfile: Optional[str] = None,
117
+ base_image: Optional[str] = None,
117
118
  ) -> Tuple[Optional[Artifact], str, List[str]]:
118
119
  wandb.termlog(f"Creating launch job of type: {job_type}...")
119
120
 
@@ -188,6 +189,7 @@ def _create_job(
188
189
  api.api,
189
190
  dockerfile=dockerfile,
190
191
  build_context=build_context,
192
+ base_image=base_image,
191
193
  )
192
194
  if not artifact:
193
195
  wandb.termerror("JobBuilder failed to build a job")