wandb-0.17.5-py3-none-any.whl → wandb-0.17.7-py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Files changed (72)
  1. wandb/__init__.py +5 -16
  2. wandb/agents/pyagent.py +1 -2
  3. wandb/apis/public/api.py +1 -1
  4. wandb/apis/public/jobs.py +5 -0
  5. wandb/bin/nvidia_gpu_stats +0 -0
  6. wandb/cli/cli.py +21 -0
  7. wandb/data_types.py +5 -4
  8. wandb/env.py +6 -0
  9. wandb/integration/kfp/wandb_logging.py +1 -1
  10. wandb/integration/lightning/fabric/logger.py +5 -5
  11. wandb/integration/openai/fine_tuning.py +13 -5
  12. wandb/integration/ultralytics/pose_utils.py +0 -1
  13. wandb/proto/v3/wandb_internal_pb2.py +226 -226
  14. wandb/proto/v3/wandb_settings_pb2.py +1 -1
  15. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  16. wandb/proto/v4/wandb_internal_pb2.py +226 -226
  17. wandb/proto/v4/wandb_settings_pb2.py +1 -1
  18. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  19. wandb/proto/v5/wandb_internal_pb2.py +226 -226
  20. wandb/proto/v5/wandb_settings_pb2.py +1 -1
  21. wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
  22. wandb/proto/wandb_deprecated.py +4 -0
  23. wandb/proto/wandb_internal_pb2.py +6 -0
  24. wandb/sdk/artifacts/artifact.py +6 -1
  25. wandb/sdk/artifacts/artifact_manifest_entry.py +31 -0
  26. wandb/sdk/artifacts/storage_handlers/azure_handler.py +35 -23
  27. wandb/sdk/data_types/_dtypes.py +5 -5
  28. wandb/sdk/data_types/base_types/media.py +3 -1
  29. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
  30. wandb/sdk/data_types/helper_types/image_mask.py +3 -1
  31. wandb/sdk/data_types/image.py +3 -1
  32. wandb/sdk/data_types/object_3d.py +113 -2
  33. wandb/sdk/data_types/saved_model.py +3 -1
  34. wandb/sdk/interface/interface.py +40 -16
  35. wandb/sdk/interface/interface_shared.py +6 -9
  36. wandb/sdk/internal/datastore.py +1 -1
  37. wandb/sdk/internal/handler.py +0 -2
  38. wandb/sdk/internal/internal.py +1 -1
  39. wandb/sdk/internal/job_builder.py +5 -2
  40. wandb/sdk/internal/sender.py +31 -15
  41. wandb/sdk/internal/tb_watcher.py +2 -2
  42. wandb/sdk/internal/update.py +2 -2
  43. wandb/sdk/launch/_launch.py +4 -2
  44. wandb/sdk/launch/_project_spec.py +34 -8
  45. wandb/sdk/launch/agent/agent.py +6 -2
  46. wandb/sdk/launch/agent/run_queue_item_file_saver.py +2 -4
  47. wandb/sdk/launch/builder/build.py +4 -2
  48. wandb/sdk/launch/builder/kaniko_builder.py +13 -5
  49. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +2 -1
  50. wandb/sdk/launch/create_job.py +2 -0
  51. wandb/sdk/launch/inputs/internal.py +42 -28
  52. wandb/sdk/launch/inputs/schema.py +39 -0
  53. wandb/sdk/launch/runner/kubernetes_runner.py +72 -0
  54. wandb/sdk/launch/runner/local_container.py +13 -10
  55. wandb/sdk/launch/runner/sagemaker_runner.py +3 -5
  56. wandb/sdk/launch/utils.py +2 -0
  57. wandb/sdk/lib/apikey.py +1 -1
  58. wandb/sdk/lib/disabled.py +13 -174
  59. wandb/sdk/service/streams.py +2 -4
  60. wandb/sdk/wandb_config.py +1 -1
  61. wandb/sdk/wandb_init.py +77 -33
  62. wandb/sdk/wandb_login.py +6 -6
  63. wandb/sdk/wandb_run.py +150 -90
  64. wandb/sdk/wandb_settings.py +4 -3
  65. wandb/sdk/wandb_setup.py +66 -3
  66. wandb/sdk/wandb_sweep.py +5 -2
  67. wandb/wandb_agent.py +2 -0
  68. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/METADATA +3 -2
  69. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/RECORD +72 -70
  70. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/WHEEL +0 -0
  71. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/entry_points.txt +0 -0
  72. {wandb-0.17.5.dist-info → wandb-0.17.7.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/interface/interface.py
@@ -8,11 +8,14 @@ InterfaceRelay: Responses are routed to a relay queue (not matching uuids)
 
 """
 
+import gzip
 import logging
 import os
 import sys
 import time
 from abc import abstractmethod
+from pathlib import Path
+from secrets import token_hex
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -47,20 +50,24 @@ from ..lib.mailbox import MailboxHandle
 from . import summary_record as sr
 from .message_future import MessageFuture
 
+MANIFEST_FILE_SIZE_THRESHOLD = 100_000
+
 GlobStr = NewType("GlobStr", str)
 
-if TYPE_CHECKING:
-    from ..wandb_run import Run
+if sys.version_info >= (3, 8):
+    from typing import Literal, TypedDict
+else:
+    from typing_extensions import Literal, TypedDict
 
-if sys.version_info >= (3, 8):
-    from typing import Literal, TypedDict
-else:
-    from typing_extensions import Literal, TypedDict
+PolicyName = Literal["now", "live", "end"]
 
-PolicyName = Literal["now", "live", "end"]
 
-class FilesDict(TypedDict):
-    files: Iterable[Tuple[GlobStr, PolicyName]]
+class FilesDict(TypedDict):
+    files: Iterable[Tuple[GlobStr, PolicyName]]
+
+
+if TYPE_CHECKING:
+    from ..wandb_run import Run
 
 
 logger = logging.getLogger("wandb")
@@ -107,15 +114,14 @@ class InterfaceBase:
     def _publish_header(self, header: pb.HeaderRecord) -> None:
         raise NotImplementedError
 
-    def communicate_status(self) -> Optional[pb.StatusResponse]:
-        status = pb.StatusRequest()
-        resp = self._communicate_status(status)
-        return resp
+    def deliver_status(self) -> MailboxHandle:
+        return self._deliver_status(pb.StatusRequest())
 
     @abstractmethod
-    def _communicate_status(
-        self, status: pb.StatusRequest
-    ) -> Optional[pb.StatusResponse]:
+    def _deliver_status(
+        self,
+        status: pb.StatusRequest,
+    ) -> MailboxHandle:
         raise NotImplementedError
 
     def _make_config(
@@ -334,6 +340,12 @@ class InterfaceBase:
         proto_manifest.version = artifact_manifest.version()
         proto_manifest.storage_policy = artifact_manifest.storage_policy.name()
 
+        # Very large manifests need to be written to file to avoid protobuf size limits.
+        if len(artifact_manifest) > MANIFEST_FILE_SIZE_THRESHOLD:
+            path = self._write_artifact_manifest_file(artifact_manifest)
+            proto_manifest.manifest_file_path = path
+            return proto_manifest
+
         for k, v in artifact_manifest.storage_policy.config().items() or {}.items():
             cfg = proto_manifest.storage_policy_config.add()
             cfg.key = k
@@ -358,6 +370,18 @@
             proto_extra.value_json = json.dumps(v)
         return proto_manifest
 
+    def _write_artifact_manifest_file(self, manifest: ArtifactManifest) -> str:
+        manifest_dir = Path(get_staging_dir()) / "artifact_manifests"
+        manifest_dir.mkdir(parents=True, exist_ok=True)
+        # It would be simpler to use `manifest.to_json()`, but that gets very slow for
+        # large manifests since it encodes the whole thing as a single JSON object.
+        filename = f"{time.time()}_{token_hex(8)}.manifest_contents.jl.gz"
+        manifest_file_path = manifest_dir / filename
+        with gzip.open(manifest_file_path, mode="wt", compresslevel=1) as f:
+            for entry in manifest.entries.values():
+                f.write(f"{json.dumps(entry.to_json())}\n")
+        return str(manifest_file_path)
+
     def deliver_link_artifact(
         self,
         run: "Run",
wandb/sdk/interface/interface_shared.py
@@ -299,7 +299,7 @@ class InterfaceShared(InterfaceBase):
         raise NotImplementedError
 
     def _communicate(
-        self, rec: pb.Record, timeout: Optional[int] = 5, local: Optional[bool] = None
+        self, rec: pb.Record, timeout: Optional[int] = 30, local: Optional[bool] = None
     ) -> Optional[pb.Result]:
         return self._communicate_async(rec, local=local).get(timeout=timeout)
 
@@ -421,15 +421,12 @@ class InterfaceShared(InterfaceBase):
         rec = self._make_record(alert=proto_alert)
         self._publish(rec)
 
-    def _communicate_status(
-        self, status: pb.StatusRequest
-    ) -> Optional[pb.StatusResponse]:
+    def _deliver_status(
+        self,
+        status: pb.StatusRequest,
+    ) -> MailboxHandle:
         req = self._make_request(status=status)
-        resp = self._communicate(req, local=True)
-        if resp is None:
-            return None
-        assert resp.response.status_response
-        return resp.response.status_response
+        return self._deliver_record(req)
 
     def _publish_exit(self, exit_data: pb.RunExitRecord) -> None:
         rec = self._make_record(exit=exit_data)
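
These hunks move the status request from the blocking communicate pattern onto the mailbox-based deliver pattern: the interface now hands back a MailboxHandle immediately and the caller decides how long to block. A hedged sketch of the new call site, assuming the MailboxHandle.wait(timeout=...) API used elsewhere in the SDK:

    # `interface` is an InterfaceShared instance; names are illustrative.
    handle = interface.deliver_status()
    result = handle.wait(timeout=5)  # None if no answer arrives in time
    if result is not None:
        status_response = result.response.status_response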
wandb/sdk/internal/datastore.py
@@ -69,7 +69,7 @@ class DataStore:
 
     def __init__(self) -> None:
         self._opened_for_scan = False
-        self._fp: Optional["IO[Any]"] = None
+        self._fp: Optional[IO[Any]] = None
         self._index = 0
         self._flush_offset = 0
         self._size_bytes = 0
wandb/sdk/internal/handler.py
@@ -745,8 +745,6 @@ class HandleManager:
         self._respond_result(result)
 
     def handle_request_status(self, record: Record) -> None:
-        # TODO(mempressure): do something better?
-        assert record.control.req_resp
         result = proto_util._result_from_record(record)
         self._respond_result(result)
 
wandb/sdk/internal/internal.py
@@ -62,7 +62,7 @@ def wandb_internal(
 
     """
     # mark this process as internal
-    wandb._set_internal_process()
+    wandb._set_internal_process()  # type: ignore
    _setup_tracelog()
    started = time.time()
 
wandb/sdk/internal/job_builder.py
@@ -423,15 +423,18 @@
         api: Api,
         build_context: Optional[str] = None,
         dockerfile: Optional[str] = None,
+        base_image: Optional[str] = None,
     ) -> Optional[Artifact]:
         """Build a job artifact from the current run.
 
         Arguments:
+            api (Api): The API object to use to create the job artifact.
             build_context (Optional[str]): Path within the job source code to
                 the image build context. Saved as part of the job for future
                 builds.
            dockerfile (Optional[str]): Path within the build context the
                Dockerfile. Saved as part of the job for future builds.
+            base_image (Optional[str]): The base image used to run the job code.
 
         Returns:
             Optional[Artifact]: The job artifact if it was successfully built,
@@ -467,8 +470,6 @@
                 "warn",
             )
             return None
-        metadata["dockerfile"] = dockerfile
-        metadata["build_context"] = build_context
 
         runtime: Optional[str] = metadata.get("python")
         # can't build a job without a python version
@@ -520,6 +521,8 @@
             source["build_context"] = build_context  # type: ignore[typeddict-item]
         if dockerfile:
             source["dockerfile"] = dockerfile  # type: ignore[typeddict-item]
+        if base_image:
+            source["base_image"] = base_image  # type: ignore[typeddict-item]
 
         # Pop any keys that are initialized to None. The current TypedDict
         # system for source dicts requires all keys to be present, but we
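
With these hunks, a job's base image is recorded in its source dict alongside the Dockerfile and build context. Roughly the shape persisted when all three are supplied (values are invented for illustration; None-valued keys are popped before saving):

    source = {
        "dockerfile": "Dockerfile.wandb",   # hypothetical value
        "build_context": "src/",            # hypothetical value
        "base_image": "python:3.10-slim",   # hypothetical value
    }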
wandb/sdk/internal/sender.py
@@ -1,5 +1,7 @@
 """sender."""
 
+import contextlib
+import gzip
 import json
 import logging
 import os
@@ -66,6 +68,7 @@ else:
 if TYPE_CHECKING:
     from wandb.proto.wandb_internal_pb2 import (
         ArtifactManifest,
+        ArtifactManifestEntry,
         ArtifactRecord,
         HttpResponse,
         LocalInfo,
@@ -105,22 +108,18 @@ def _framework_priority() -> Generator[Tuple[str, str], None, None]:
 
 def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
     if manifest.version == 1:
-        contents = {
-            content.path: {
-                "digest": content.digest,
-                "birthArtifactID": content.birth_artifact_id
-                if content.birth_artifact_id
-                else None,
-                "ref": content.ref if content.ref else None,
-                "size": content.size if content.size is not None else None,
-                "local_path": content.local_path if content.local_path else None,
-                "skip_cache": content.skip_cache,
-                "extra": {
-                    extra.key: json.loads(extra.value_json) for extra in content.extra
-                },
+        if manifest.manifest_file_path:
+            contents = {}
+            with gzip.open(manifest.manifest_file_path, "rt") as f:
+                for line in f:
+                    entry_json = json.loads(line)
+                    path = entry_json.pop("path")
+                    contents[path] = entry_json
+        else:
+            contents = {
+                content.path: _manifest_entry_from_proto(content)
+                for content in manifest.contents
             }
-            for content in manifest.contents
-        }
     else:
         raise ValueError(f"unknown artifact manifest version: {manifest.version}")
@@ -135,6 +134,19 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
     }
 
 
+def _manifest_entry_from_proto(entry: "ArtifactManifestEntry") -> Dict:
+    birth_artifact_id = entry.birth_artifact_id if entry.birth_artifact_id else None
+    return {
+        "digest": entry.digest,
+        "birthArtifactID": birth_artifact_id,
+        "ref": entry.ref if entry.ref else None,
+        "size": entry.size if entry.size is not None else None,
+        "local_path": entry.local_path if entry.local_path else None,
+        "skip_cache": entry.skip_cache,
+        "extra": {extra.key: json.loads(extra.value_json) for extra in entry.extra},
+    }
+
+
 class ResumeState:
     resumed: bool
     step: int
@@ -1586,6 +1598,10 @@ class SendManager:
         )
 
         self._job_builder._handle_server_artifact(res, artifact)
+
+        if artifact.manifest.manifest_file_path:
+            with contextlib.suppress(FileNotFoundError):
+                os.remove(artifact.manifest.manifest_file_path)
         return res
 
     def send_alert(self, record: "Record") -> None:
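
The sender-side changes complete the round trip: _manifest_json_from_proto now branches on manifest_file_path, streaming entries back out of the gzipped JSON-Lines file instead of the protobuf contents field, and the temporary file is removed once the artifact has been handed off. For reference, one reconstructed entry has this shape per _manifest_entry_from_proto (all values invented):

    entry = {
        "digest": "sha256:9f2b...",             # hypothetical digest
        "birthArtifactID": None,
        "ref": None,
        "size": 1024,
        "local_path": "/tmp/staging/data.bin",  # hypothetical path
        "skip_cache": False,
        "extra": {},
    }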
wandb/sdk/internal/tb_watcher.py
@@ -123,7 +123,7 @@ class TBWatcher:
         self._force = force
         # TODO(jhr): do we need locking in this queue?
         self._watcher_queue = queue.PriorityQueue()
-        wandb.tensorboard.reset_state()
+        wandb.tensorboard.reset_state()  # type: ignore
 
     def _calculate_namespace(self, logdir: str, rootdir: str) -> Optional[str]:
         namespace: Optional[str]
@@ -430,7 +430,7 @@ class TBEventConsumer:
     def _handle_event(
         self, event: "ProtoEvent", history: Optional["TBHistory"] = None
     ) -> None:
-        wandb.tensorboard._log(
+        wandb.tensorboard._log(  # type: ignore
            event.event,
            step=event.event.step,
            namespace=event.namespace,
wandb/sdk/internal/update.py
@@ -10,7 +10,7 @@ def _find_available(
 ) -> Optional[Tuple[str, bool, bool, bool, Optional[str]]]:
     from wandb.util import parse_version
 
-    pypi_url = f"https://pypi.org/pypi/{wandb._wandb_module}/json"
+    pypi_url = "https://pypi.org/pypi/wandb/json"
 
     yanked_dict = {}
     try:
@@ -78,7 +78,7 @@ def check_available(current_version: str) -> Optional[Dict[str, Optional[str]]]:
     if not package_info:
         return None
 
-    wandb_module_name = wandb._wandb_module
+    wandb_module_name = "wandb"
 
     latest_version, pip_prerelease, deleted, yanked, yanked_reason = package_info
     upgrade_message = (
wandb/sdk/launch/_launch.py
@@ -211,7 +211,9 @@ async def _launch(
     launch_project = LaunchProject.from_spec(launch_spec, api)
     launch_project.fetch_and_validate_project()
     entrypoint = launch_project.get_job_entry_point()
-    image_uri = launch_project.docker_image  # Either set by user or None.
+    image_uri = (
+        launch_project.docker_image or launch_project.job_base_image
+    )  # Either set by user or None.
 
     # construct runner config.
     runner_config: Dict[str, Any] = {}
@@ -224,7 +226,7 @@
     await environment.verify()
     registry = loader.registry_from_config(registry_config, environment)
     builder = loader.builder_from_config(build_config, environment, registry)
-    if not launch_project.docker_image:
+    if not (launch_project.docker_image or launch_project.job_base_image):
         assert entrypoint
         image_uri = await builder.build_image(launch_project, entrypoint, None)
     backend = loader.runner_from_config(
wandb/sdk/launch/_project_spec.py
@@ -7,6 +7,7 @@ import enum
 import json
 import logging
 import os
+import shutil
 import tempfile
 from copy import deepcopy
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
@@ -112,6 +113,9 @@ class LaunchProject:
         self.sweep_id = sweep_id
         self.author = launch_spec.get("author")
         self.python_version: Optional[str] = launch_spec.get("python_version")
+        self._job_dockerfile: Optional[str] = None
+        self._job_build_context: Optional[str] = None
+        self._job_base_image: Optional[str] = None
         self.accelerator_base_image: Optional[str] = resource_args_build.get(
             "accelerator", {}
         ).get("base_image") or resource_args_build.get("cuda", {}).get("base_image")
@@ -131,8 +135,6 @@
         self._queue_name: Optional[str] = None
         self._queue_entity: Optional[str] = None
         self._run_queue_item_id: Optional[str] = None
-        self._job_dockerfile: Optional[str] = None
-        self._job_build_context: Optional[str] = None
 
     def init_source(self) -> None:
         if self.docker_image is not None:
@@ -146,6 +148,21 @@
             self.project_dir = os.getcwd()
             self._entry_point = self.override_entrypoint
 
+    def change_project_dir(self, new_dir: str) -> None:
+        """Change the project directory to a new directory."""
+        # Copy the contents of the old project dir to the new project dir.
+        old_dir = self.project_dir
+        if old_dir is not None:
+            shutil.copytree(
+                old_dir,
+                new_dir,
+                symlinks=True,
+                dirs_exist_ok=True,
+                ignore=shutil.ignore_patterns("fsmonitor--daemon.ipc", ".git"),
+            )
+            shutil.rmtree(old_dir)
+        self.project_dir = new_dir
+
     def init_git(self, git_info: Dict[str, str]) -> None:
         self.git_version = git_info.get("version")
         self.git_repo = git_info.get("repo")
@@ -212,14 +229,23 @@
     def job_build_context(self) -> Optional[str]:
         return self._job_build_context
 
+    @property
+    def job_base_image(self) -> Optional[str]:
+        return self._job_base_image
+
     def set_job_dockerfile(self, dockerfile: str) -> None:
         self._job_dockerfile = dockerfile
 
     def set_job_build_context(self, build_context: str) -> None:
         self._job_build_context = build_context
 
+    def set_job_base_image(self, base_image: str) -> None:
+        self._job_base_image = base_image
+
     @property
     def image_name(self) -> str:
+        if self.job_base_image is not None:
+            return self.job_base_image
         if self.docker_image is not None:
             return self.docker_image
         elif self.uri is not None:
@@ -299,10 +325,8 @@
 
     def build_required(self) -> bool:
         """Checks the source to see if a build is required."""
-        # since the image tag for images built from jobs
-        # is based on the job version index, which is immutable
-        # we don't need to build the image for a job if that tag
-        # already exists
+        if self.job_base_image is not None:
+            return False
         if self.source != LaunchSource.JOB:
             return True
         return False
@@ -316,7 +340,9 @@
         Returns:
             Optional[str]: The Docker image or None if not specified.
         """
-        return self._docker_image
+        if self._docker_image:
+            return self._docker_image
+        return None
 
     @docker_image.setter
     def docker_image(self, value: str) -> None:
@@ -336,7 +362,7 @@
         # assuming project only has 1 entry point, pull that out
         # tmp fn until we figure out if we want to support multiple entry points or not
         if not self._entry_point:
-            if not self.docker_image:
+            if not self.docker_image and not self.job_base_image:
                 raise LaunchError(
                     "Project must have at least one entry point unless docker image is specified."
                 )
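
Taken together, the _project_spec.py hunks give job_base_image a simple precedence: it supplies image_name directly and switches the image build off. A hedged sketch of the resulting behavior (LaunchProject construction elided; the image tag is invented):

    project.set_job_base_image("python:3.10-slim")   # hypothetical image
    assert project.image_name == "python:3.10-slim"  # base image wins
    assert project.build_required() is False         # launch skips the build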
wandb/sdk/launch/agent/agent.py
@@ -717,7 +717,7 @@ class LaunchAgent:
         _, build_config, registry_config = construct_agent_configs(
             default_config, override_build_config
         )
-        image_uri = project.docker_image
+        image_uri = project.docker_image or project.job_base_image
         entrypoint = project.get_job_entry_point()
         environment = loader.environment_from_config(
             default_config.get("environment", {})
@@ -727,7 +727,11 @@
         backend = loader.runner_from_config(
             resource, api, backend_config, environment, registry
         )
-        if not (project.docker_image or isinstance(backend, LocalProcessRunner)):
+        if not (
+            project.docker_image
+            or project.job_base_image
+            or isinstance(backend, LocalProcessRunner)
+        ):
             assert entrypoint is not None
             image_uri = await builder.build_image(project, entrypoint, job_tracker)
 
wandb/sdk/launch/agent/run_queue_item_file_saver.py
@@ -2,7 +2,7 @@
 
 import os
 import sys
-from typing import List, Optional, Union
+from typing import List, Optional
 
 import wandb
 
@@ -17,9 +17,7 @@ FileSubtypes = Literal["warning", "error"]
 class RunQueueItemFileSaver:
     def __init__(
         self,
-        agent_run: Optional[
-            Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
-        ],
+        agent_run: Optional["wandb.sdk.wandb_run.Run"],
         run_queue_item_id: str,
     ):
         self.run_queue_item_id = run_queue_item_id
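
Dropping RunDisabled from this annotation lines up with wandb/sdk/lib/disabled.py shrinking by 174 lines in this release: a disabled run is now represented by an ordinary Run, so call sites no longer need the Union. A sketch of the user-facing behavior this relies on (assuming disabled mode keeps its no-op logging semantics):

    import wandb

    run = wandb.init(mode="disabled")  # now a plain Run rather than RunDisabled
    run.log({"loss": 0.0})             # no-op; nothing is sent anywhere
    run.finish()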
wandb/sdk/launch/builder/build.py
@@ -201,7 +201,7 @@ def get_requirements_section(
     # If there is a requirements.txt at root of build context, use that.
     if (base_path / "src" / "requirements.txt").exists():
         requirements_files += ["src/requirements.txt"]
-        deps_install_line = "pip install -r requirements.txt"
+        deps_install_line = "pip install uv && uv pip install -r requirements.txt"
         with open(base_path / "src" / "requirements.txt") as f:
             requirements = f.readlines()
             if not any(["wandb" in r for r in requirements]):
@@ -237,7 +237,9 @@
         with open(base_path / "src" / "requirements.txt", "w") as f:
             f.write("\n".join(project_deps))
         requirements_files += ["src/requirements.txt"]
-        deps_install_line = "pip install -r requirements.txt"
+        deps_install_line = (
+            "pip install uv && uv pip install -r requirements.txt"
+        )
     return PIP_TEMPLATE.format(
         buildx_optional_prefix=prefix,
         requirements_files=" ".join(requirements_files),
wandb/sdk/launch/builder/kaniko_builder.py
@@ -63,6 +63,13 @@ else:
 NAMESPACE = "wandb"
 
 
+def get_pod_name_safe(job: client.V1Job):
+    try:
+        return job.spec.template.metadata.name
+    except AttributeError:
+        return None
+
+
 async def _wait_for_completion(
     batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
 ) -> bool:
@@ -319,17 +326,18 @@ class KanikoBuilder(AbstractBuilder):
             await self._create_docker_ecr_config_map(
                 build_job_name, core_v1, repo_uri
             )
-            await batch_v1.create_namespaced_job(NAMESPACE, build_job)
-
+            k8s_job = await batch_v1.create_namespaced_job(NAMESPACE, build_job)
             # wait for double the job deadline since it might take time to schedule
             if not await _wait_for_completion(
                 batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
             ):
                 if job_tracker:
                     job_tracker.set_err_stage("build")
-                raise Exception(
-                    f"Failed to build image in kaniko for job {run_id}. View logs with `kubectl logs -n {NAMESPACE} {build_job_name}`."
-                )
+                msg = f"Failed to build image in kaniko for job {run_id}."
+                pod_name = get_pod_name_safe(k8s_job)
+                if pod_name:
+                    msg += f" View logs with `kubectl logs -n {NAMESPACE} {pod_name}`."
+                raise Exception(msg)
             try:
                 pods_from_job = await core_v1.list_namespaced_pod(
                     namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
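
get_pod_name_safe guards against partially populated Kubernetes objects: a missing link anywhere in job.spec.template.metadata.name raises AttributeError, which is mapped to None so the failure message simply omits the kubectl hint. A quick self-contained check (the stand-in class is ours, not the kubernetes client):

    class _JobWithoutSpec:
        spec = None  # stand-in for a V1Job the API returned incompletely

    assert get_pod_name_safe(_JobWithoutSpec()) is None  # AttributeError -> None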
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py
@@ -39,12 +39,13 @@ def install_deps(
         deps (str[], None): The dependencies that failed to install
     """
     try:
+        subprocess.check_output(["pip", "install", "uv"], stderr=subprocess.STDOUT)
         # Include only uri if @ is present
         clean_deps = [d.split("@")[-1].strip() if "@" in d else d for d in deps]
         index_args = ["--extra-index-url", extra_index] if extra_index else []
         print("installing {}...".format(", ".join(clean_deps)))
         opts = opts or []
-        args = ["pip", "install"] + opts + clean_deps + index_args
+        args = ["uv", "pip", "install"] + opts + clean_deps + index_args
         sys.stdout.flush()
         subprocess.check_output(args, stderr=subprocess.STDOUT)
         return failed
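
Both the generated Dockerfile (build.py above) and this runtime bootstrap now route dependency installs through uv. The bootstrap's new behavior reduces to roughly this sequence (a sketch; assumes pip is on PATH and the example package resolves):

    import subprocess

    # Install uv once with stock pip, then let uv perform the actual installs.
    subprocess.check_output(["pip", "install", "uv"], stderr=subprocess.STDOUT)
    subprocess.check_output(["uv", "pip", "install", "requests"], stderr=subprocess.STDOUT)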
wandb/sdk/launch/create_job.py
@@ -114,6 +114,7 @@
     git_hash: Optional[str] = None,
     build_context: Optional[str] = None,
     dockerfile: Optional[str] = None,
+    base_image: Optional[str] = None,
 ) -> Tuple[Optional[Artifact], str, List[str]]:
     wandb.termlog(f"Creating launch job of type: {job_type}...")
 
@@ -188,6 +189,7 @@ def _create_job(
         api.api,
         dockerfile=dockerfile,
         build_context=build_context,
+        base_image=base_image,
     )
     if not artifact:
         wandb.termerror("JobBuilder failed to build a job")