wandb 0.16.4__py3-none-any.whl → 0.16.6__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. wandb/__init__.py +2 -2
  2. wandb/agents/pyagent.py +1 -1
  3. wandb/apis/public/api.py +6 -6
  4. wandb/apis/reports/v2/interface.py +4 -8
  5. wandb/apis/reports/v2/internal.py +12 -45
  6. wandb/cli/cli.py +29 -5
  7. wandb/integration/openai/fine_tuning.py +74 -37
  8. wandb/integration/ultralytics/callback.py +0 -1
  9. wandb/proto/v3/wandb_internal_pb2.py +332 -312
  10. wandb/proto/v3/wandb_settings_pb2.py +13 -3
  11. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  12. wandb/proto/v4/wandb_internal_pb2.py +316 -312
  13. wandb/proto/v4/wandb_settings_pb2.py +5 -3
  14. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  15. wandb/sdk/artifacts/artifact.py +92 -26
  16. wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
  17. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
  18. wandb/sdk/artifacts/artifact_saver.py +16 -36
  19. wandb/sdk/artifacts/storage_handler.py +2 -1
  20. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +13 -5
  21. wandb/sdk/interface/interface.py +60 -15
  22. wandb/sdk/interface/interface_shared.py +13 -7
  23. wandb/sdk/internal/file_stream.py +19 -0
  24. wandb/sdk/internal/handler.py +1 -4
  25. wandb/sdk/internal/internal_api.py +2 -0
  26. wandb/sdk/internal/job_builder.py +45 -17
  27. wandb/sdk/internal/sender.py +53 -28
  28. wandb/sdk/internal/settings_static.py +9 -0
  29. wandb/sdk/internal/system/system_info.py +4 -1
  30. wandb/sdk/launch/_launch.py +5 -0
  31. wandb/sdk/launch/_project_spec.py +5 -20
  32. wandb/sdk/launch/agent/agent.py +80 -37
  33. wandb/sdk/launch/agent/config.py +8 -0
  34. wandb/sdk/launch/builder/kaniko_builder.py +149 -134
  35. wandb/sdk/launch/create_job.py +44 -48
  36. wandb/sdk/launch/runner/kubernetes_monitor.py +3 -1
  37. wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
  38. wandb/sdk/launch/sweeps/scheduler.py +3 -1
  39. wandb/sdk/launch/utils.py +23 -5
  40. wandb/sdk/lib/__init__.py +2 -5
  41. wandb/sdk/lib/_settings_toposort_generated.py +2 -0
  42. wandb/sdk/lib/filesystem.py +11 -1
  43. wandb/sdk/lib/run_moment.py +78 -0
  44. wandb/sdk/service/streams.py +1 -6
  45. wandb/sdk/wandb_init.py +12 -7
  46. wandb/sdk/wandb_login.py +43 -26
  47. wandb/sdk/wandb_run.py +179 -94
  48. wandb/sdk/wandb_settings.py +55 -16
  49. wandb/testing/relay.py +5 -6
  50. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/METADATA +1 -1
  51. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/RECORD +55 -54
  52. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/WHEEL +1 -1
  53. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/LICENSE +0 -0
  54. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/entry_points.txt +0 -0
  55. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/top_level.txt +0 -0
@@ -13,8 +13,19 @@ import os
13
13
  import sys
14
14
  import time
15
15
  from abc import abstractmethod
16
- from typing import TYPE_CHECKING, Any, Dict, Iterable, NewType, Optional, Tuple, Union
16
+ from typing import (
17
+ TYPE_CHECKING,
18
+ Any,
19
+ Dict,
20
+ Iterable,
21
+ List,
22
+ NewType,
23
+ Optional,
24
+ Tuple,
25
+ Union,
26
+ )
17
27
 
28
+ from wandb import termwarn
18
29
  from wandb.proto import wandb_internal_pb2 as pb
19
30
  from wandb.proto import wandb_telemetry_pb2 as tpb
20
31
  from wandb.sdk.artifacts.artifact import Artifact
@@ -340,6 +351,7 @@ class InterfaceBase:
340
351
  proto_entry.ref = entry.ref
341
352
  if entry.local_path:
342
353
  proto_entry.local_path = entry.local_path
354
+ proto_entry.skip_cache = entry.skip_cache
343
355
  for k, v in entry.extra.items():
344
356
  proto_extra = proto_entry.extra.add()
345
357
  proto_extra.key = k
@@ -436,16 +448,27 @@ class InterfaceBase:
436
448
  path = artifact.get_entry("wandb-job.json").download()
437
449
  with open(path) as f:
438
450
  job_info = json.load(f)
451
+
439
452
  except Exception as e:
440
453
  logger.warning(
441
454
  f"Failed to download partial job info from artifact {artifact}, : {e}"
442
455
  )
443
- use_artifact = self._make_proto_use_artifact(
444
- use_artifact=use_artifact,
445
- job_name=artifact.name,
446
- job_info=job_info,
447
- metadata=artifact.metadata,
448
- )
456
+ termwarn(
457
+ f"Failed to download partial job info from artifact {artifact}, : {e}"
458
+ )
459
+ return
460
+
461
+ try:
462
+ use_artifact = self._make_proto_use_artifact(
463
+ use_artifact=use_artifact,
464
+ job_name=artifact.name,
465
+ job_info=job_info,
466
+ metadata=artifact.metadata,
467
+ )
468
+ except Exception as e:
469
+ logger.warning(f"Failed to construct use artifact proto: {e}")
470
+ termwarn(f"Failed to construct use artifact proto: {e}")
471
+ return
449
472
 
450
473
  self._publish_use_artifact(use_artifact)
451
474
 
@@ -756,6 +779,36 @@ class InterfaceBase:
756
779
  run_start.run.CopyFrom(run_pb)
757
780
  return self._deliver_run_start(run_start)
758
781
 
782
+ def publish_launch_wandb_config_parameters(
783
+ self, include_paths: List[List[str]], exclude_paths: List[List[str]]
784
+ ):
785
+ """Tells the internal process to treat wandb.config fields as job inputs.
786
+
787
+ The paths provided as arguments are sequences of dictionary keys that
788
+ specify a path within the wandb.config. If a path is included, the
789
+ corresponding field will be treated as a job input. If a path is
790
+ excluded, the corresponding field will not be treated as a job input.
791
+
792
+ Args:
793
+ include_paths: paths within config to include as job inputs.
794
+ exclude_paths: paths within config to exclude as job inputs.
795
+
796
+ Returns:
797
+ None
798
+ """
799
+ config_parameters = pb.LaunchWandbConfigParametersRecord()
800
+ include_records = [pb.ConfigFilterPath(path=path) for path in include_paths]
801
+ exclude_records = [pb.ConfigFilterPath(path=path) for path in exclude_paths]
802
+ config_parameters.include_paths.extend(include_records)
803
+ config_parameters.exclude_paths.extend(exclude_records)
804
+ return self._publish_launch_wandb_config_parameters(config_parameters)
805
+
806
+ @abstractmethod
807
+ def _publish_launch_wandb_config_parameters(
808
+ self, config_parameters: pb.LaunchWandbConfigParametersRecord
809
+ ) -> None:
810
+ raise NotImplementedError
811
+
759
812
  @abstractmethod
760
813
  def _deliver_run_start(self, run_start: pb.RunStartRequest) -> MailboxHandle:
761
814
  raise NotImplementedError
@@ -871,11 +924,3 @@ class InterfaceBase:
871
924
  self, run_status: pb.RunStatusRequest
872
925
  ) -> MailboxHandle:
873
926
  raise NotImplementedError
874
-
875
- def deliver_request_job_info(self) -> MailboxHandle:
876
- job_info = pb.JobInfoRequest()
877
- return self._deliver_request_job_info(job_info)
878
-
879
- @abstractmethod
880
- def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
881
- raise NotImplementedError
@@ -145,7 +145,6 @@ class InterfaceShared(InterfaceBase):
145
145
  cancel: Optional[pb.CancelRequest] = None,
146
146
  summary_record: Optional[pb.SummaryRecordRequest] = None,
147
147
  telemetry_record: Optional[pb.TelemetryRecordRequest] = None,
148
- job_info: Optional[pb.JobInfoRequest] = None,
149
148
  get_system_metrics: Optional[pb.GetSystemMetricsRequest] = None,
150
149
  python_packages: Optional[pb.PythonPackagesRequest] = None,
151
150
  ) -> pb.Record:
@@ -202,8 +201,6 @@ class InterfaceShared(InterfaceBase):
202
201
  request.summary_record.CopyFrom(summary_record)
203
202
  elif telemetry_record:
204
203
  request.telemetry_record.CopyFrom(telemetry_record)
205
- elif job_info:
206
- request.job_info.CopyFrom(job_info)
207
204
  elif get_system_metrics:
208
205
  request.get_system_metrics.CopyFrom(get_system_metrics)
209
206
  elif sync:
@@ -242,6 +239,9 @@ class InterfaceShared(InterfaceBase):
242
239
  use_artifact: Optional[pb.UseArtifactRecord] = None,
243
240
  output: Optional[pb.OutputRecord] = None,
244
241
  output_raw: Optional[pb.OutputRawRecord] = None,
242
+ launch_wandb_config_parameters: Optional[
243
+ pb.LaunchWandbConfigParametersRecord
244
+ ] = None,
245
245
  ) -> pb.Record:
246
246
  record = pb.Record()
247
247
  if run:
@@ -286,6 +286,8 @@ class InterfaceShared(InterfaceBase):
286
286
  record.output.CopyFrom(output)
287
287
  elif output_raw:
288
288
  record.output_raw.CopyFrom(output_raw)
289
+ elif launch_wandb_config_parameters:
290
+ record.wandb_config_parameters.CopyFrom(launch_wandb_config_parameters)
289
291
  else:
290
292
  raise Exception("Invalid record")
291
293
  return record
@@ -415,6 +417,14 @@ class InterfaceShared(InterfaceBase):
415
417
  rec = self._make_record(alert=proto_alert)
416
418
  self._publish(rec)
417
419
 
420
+ def _publish_launch_wandb_config_parameters(
421
+ self, launch_wandb_config_parameters: pb.LaunchWandbConfigParametersRecord
422
+ ) -> None:
423
+ rec = self._make_record(
424
+ launch_wandb_config_parameters=launch_wandb_config_parameters
425
+ )
426
+ self._publish(rec)
427
+
418
428
  def _communicate_status(
419
429
  self, status: pb.StatusRequest
420
430
  ) -> Optional[pb.StatusResponse]:
@@ -523,10 +533,6 @@ class InterfaceShared(InterfaceBase):
523
533
  record = self._make_request(run_status=run_status)
524
534
  return self._deliver_record(record)
525
535
 
526
- def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
527
- record = self._make_request(job_info=job_info)
528
- return self._deliver_record(record)
529
-
530
536
  def _transport_keepalive_failed(self, keepalive_interval: int = 5) -> bool:
531
537
  if self._transport_failed:
532
538
  return True
@@ -1,6 +1,7 @@
1
1
  import base64
2
2
  import functools
3
3
  import itertools
4
+ import json
4
5
  import logging
5
6
  import os
6
7
  import queue
@@ -58,6 +59,7 @@ class Chunk(NamedTuple):
58
59
  class DefaultFilePolicy:
59
60
  def __init__(self, start_chunk_id: int = 0) -> None:
60
61
  self._chunk_id = start_chunk_id
62
+ self.has_debug_log = False
61
63
 
62
64
  def process_chunks(
63
65
  self, chunks: List[Chunk]
@@ -66,6 +68,21 @@ class DefaultFilePolicy:
66
68
  self._chunk_id += len(chunks)
67
69
  return {"offset": chunk_id, "content": [c.data for c in chunks]}
68
70
 
71
+ # TODO: this is very inefficient, this is meant for temporary debugging and will be removed in future releases
72
+ def _debug_log(self, data: Any):
73
+ if self.has_debug_log or not os.environ.get("WANDB_DEBUG_FILESTREAM_LOG"):
74
+ return
75
+
76
+ loaded = json.loads(data)
77
+ if not isinstance(loaded, dict):
78
+ return
79
+
80
+ # get key size and convert to MB
81
+ key_sizes = [(k, len(json.dumps(v))) for k, v in loaded.items()]
82
+ key_msg = [f"{k}: {v/1048576:.5f} MB" for k, v in key_sizes]
83
+ wandb.termerror(f"Step: {loaded['_step']} | {key_msg}", repeat=False)
84
+ self.has_debug_log = True
85
+
69
86
 
70
87
  class JsonlFilePolicy(DefaultFilePolicy):
71
88
  def process_chunks(self, chunks: List[Chunk]) -> "ProcessedChunk":
@@ -81,6 +98,7 @@ class JsonlFilePolicy(DefaultFilePolicy):
81
98
  )
82
99
  wandb.termerror(msg, repeat=False)
83
100
  wandb._sentry.message(msg, repeat=False)
101
+ self._debug_log(chunk.data)
84
102
  else:
85
103
  chunk_data.append(chunk.data)
86
104
 
@@ -99,6 +117,7 @@ class SummaryFilePolicy(DefaultFilePolicy):
99
117
  )
100
118
  wandb.termerror(msg, repeat=False)
101
119
  wandb._sentry.message(msg, repeat=False)
120
+ self._debug_log(data)
102
121
  return False
103
122
  return {"offset": 0, "content": [data]}
104
123
 
@@ -689,7 +689,7 @@ class HandleManager:
689
689
  self._settings, interface=self._interface, run_proto=run_start.run
690
690
  )
691
691
 
692
- if run_start.run.resumed:
692
+ if run_start.run.resumed or run_start.run.forked:
693
693
  self._step = run_start.run.starting_step
694
694
  result = proto_util._result_from_record(record)
695
695
  self._respond_result(result)
@@ -862,9 +862,6 @@ class HandleManager:
862
862
  self._respond_result(result)
863
863
  self._stopped.set()
864
864
 
865
- def handle_request_job_info(self, record: Record) -> None:
866
- self._dispatch_record(record, always_send=True)
867
-
868
865
  def finish(self) -> None:
869
866
  logger.info("shutting down handler")
870
867
  if self._system_monitor is not None:
@@ -2150,6 +2150,7 @@ class Api:
2150
2150
  name
2151
2151
  }
2152
2152
  }
2153
+ historyLineCount
2153
2154
  }
2154
2155
  inserted
2155
2156
  _Server_Settings_
@@ -2237,6 +2238,7 @@ class Api:
2237
2238
  .get("serverSettings", {})
2238
2239
  .get("serverMessages", [])
2239
2240
  )
2241
+
2240
2242
  return (
2241
2243
  response["upsertBucket"]["bucket"],
2242
2244
  response["upsertBucket"]["inserted"],
@@ -4,7 +4,7 @@ import logging
4
4
  import os
5
5
  import re
6
6
  import sys
7
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
7
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
8
8
 
9
9
  import wandb
10
10
  from wandb.sdk.artifacts.artifact import Artifact
@@ -28,6 +28,8 @@ FROZEN_REQUIREMENTS_FNAME = "requirements.frozen.txt"
28
28
  JOB_FNAME = "wandb-job.json"
29
29
  JOB_ARTIFACT_TYPE = "job"
30
30
 
31
+ LOG_LEVEL = Literal["log", "warn", "error"]
32
+
31
33
 
32
34
  class GitInfo(TypedDict):
33
35
  remote: str
@@ -89,8 +91,9 @@ class JobBuilder:
89
91
  _job_seq_id: Optional[str]
90
92
  _job_version_alias: Optional[str]
91
93
  _is_notebook_run: bool
94
+ _verbose: bool
92
95
 
93
- def __init__(self, settings: SettingsStatic):
96
+ def __init__(self, settings: SettingsStatic, verbose: bool = False):
94
97
  self._settings = settings
95
98
  self._metadatafile_path = None
96
99
  self._requirements_path = None
@@ -106,6 +109,7 @@ class JobBuilder:
106
109
  Literal["repo", "artifact", "image"]
107
110
  ] = settings.job_source # type: ignore[assignment]
108
111
  self._is_notebook_run = self._get_is_notebook_run()
112
+ self._verbose = verbose
109
113
 
110
114
  def set_config(self, config: Dict[str, Any]) -> None:
111
115
  self._config = config
@@ -197,6 +201,21 @@ class JobBuilder:
197
201
 
198
202
  return source, name
199
203
 
204
+ def _log_if_verbose(self, message: str, level: LOG_LEVEL) -> None:
205
+ log_func: Optional[Union[Callable[[Any], None], Callable[[Any], None]]] = None
206
+ if level == "log":
207
+ _logger.info(message)
208
+ log_func = wandb.termlog
209
+ elif level == "warn":
210
+ _logger.warning(message)
211
+ log_func = wandb.termwarn
212
+ elif level == "error":
213
+ _logger.error(message)
214
+ log_func = wandb.termerror
215
+
216
+ if self._verbose and log_func is not None:
217
+ log_func(message)
218
+
200
219
  def _build_artifact_job_source(
201
220
  self,
202
221
  program_relpath: str,
@@ -212,8 +231,9 @@ class JobBuilder:
212
231
  # at the directory the notebook is in instead of the jupyter core
213
232
  if not os.path.exists(os.path.basename(program_relpath)):
214
233
  _logger.info("target path does not exist, exiting")
215
- wandb.termwarn(
216
- "No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job"
234
+ self._log_if_verbose(
235
+ "No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job",
236
+ "warn",
217
237
  )
218
238
  return None, None
219
239
  full_program_relpath = os.path.basename(program_relpath)
@@ -299,22 +319,25 @@ class JobBuilder:
299
319
  if not os.path.exists(
300
320
  os.path.join(self._settings.files_dir, REQUIREMENTS_FNAME)
301
321
  ):
302
- wandb.termwarn(
303
- "No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
322
+ self._log_if_verbose(
323
+ "No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
324
+ "warn",
304
325
  )
305
326
  return None
306
327
  metadata = self._handle_metadata_file()
307
328
  if metadata is None:
308
- wandb.termwarn(
309
- f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables"
329
+ self._log_if_verbose(
330
+ f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables",
331
+ "warn",
310
332
  )
311
333
  return None
312
334
 
313
335
  runtime: Optional[str] = metadata.get("python")
314
336
  # can't build a job without a python version
315
337
  if runtime is None:
316
- wandb.termwarn(
317
- "No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
338
+ self._log_if_verbose(
339
+ "No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
340
+ "warn",
318
341
  )
319
342
  return None
320
343
 
@@ -345,13 +368,16 @@ class JobBuilder:
345
368
  or self._settings.job_source
346
369
  or self._source_type
347
370
  ):
348
- wandb.termwarn("No source type found, not creating job artifact")
371
+ self._log_if_verbose(
372
+ "No source type found, not creating job artifact", "warn"
373
+ )
349
374
  return None
350
375
 
351
376
  program_relpath = self._get_program_relpath(source_type, metadata)
352
377
  if source_type != "image" and not program_relpath:
353
- wandb.termwarn(
354
- "No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
378
+ self._log_if_verbose(
379
+ "No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
380
+ "warn",
355
381
  )
356
382
  return None
357
383
 
@@ -377,10 +403,11 @@ class JobBuilder:
377
403
 
378
404
  if source is None:
379
405
  if source_type:
380
- wandb.termwarn(
406
+ self._log_if_verbose(
381
407
  f"Source type is set to '{source_type}' but some required information is missing "
382
408
  "from the environment. A job will not be created from this run. See "
383
- "https://docs.wandb.ai/guides/launch/create-job"
409
+ "https://docs.wandb.ai/guides/launch/create-job",
410
+ "warn",
384
411
  )
385
412
  return None
386
413
 
@@ -447,8 +474,9 @@ class JobBuilder:
447
474
  program = metadata.get("program")
448
475
 
449
476
  if not program:
450
- wandb.termwarn(
451
- "Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job"
477
+ self._log_if_verbose(
478
+ "Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job",
479
+ "warn",
452
480
  )
453
481
 
454
482
  return program
@@ -115,6 +115,7 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
115
115
  "ref": content.ref if content.ref else None,
116
116
  "size": content.size if content.size is not None else None,
117
117
  "local_path": content.local_path if content.local_path else None,
118
+ "skip_cache": content.skip_cache,
118
119
  "extra": {
119
120
  extra.key: json.loads(extra.value_json) for extra in content.extra
120
121
  },
@@ -733,18 +734,7 @@ class SendManager:
733
734
  )
734
735
  self._respond_result(result)
735
736
 
736
- def send_request_job_info(self, record: "Record") -> None:
737
- """Respond to a request for a job link."""
738
- result = proto_util._result_from_record(record)
739
- result.response.job_info_response.sequenceId = (
740
- self._job_builder._job_seq_id or ""
741
- )
742
- result.response.job_info_response.version = (
743
- self._job_builder._job_version_alias or ""
744
- )
745
- self._respond_result(result)
746
-
747
- def _maybe_setup_resume(
737
+ def _setup_resume(
748
738
  self, run: "RunRecord"
749
739
  ) -> Optional["wandb_internal_pb2.ErrorInfo"]:
750
740
  """Queries the backend for a run; fail if the settings are incompatible."""
@@ -890,6 +880,30 @@ class SendManager:
890
880
  pass
891
881
  # TODO: do something if sync spell is not successful?
892
882
 
883
+ def _setup_fork(self, server_run: dict):
884
+ assert self._settings.fork_from
885
+ assert self._settings.fork_from.metric == "_step"
886
+ assert self._run
887
+ first_step = int(self._settings.fork_from.value) + 1
888
+ self._resume_state.step = first_step
889
+ self._resume_state.history = server_run.get("historyLineCount", 0)
890
+ self._run.forked = True
891
+ self._run.starting_step = first_step
892
+
893
+ def _handle_error(
894
+ self,
895
+ record: "Record",
896
+ error: "wandb_internal_pb2.ErrorInfo",
897
+ run: "RunRecord",
898
+ ) -> None:
899
+ if record.control.req_resp or record.control.mailbox_slot:
900
+ result = proto_util._result_from_record(record)
901
+ result.run_result.run.CopyFrom(run)
902
+ result.run_result.error.CopyFrom(error)
903
+ self._respond_result(result)
904
+ else:
905
+ logger.error("Got error in async mode: %s", error.message)
906
+
893
907
  def send_run(self, record: "Record", file_dir: Optional[str] = None) -> None:
894
908
  run = record.run
895
909
  error = None
@@ -911,21 +925,28 @@ class SendManager:
911
925
  config_value_dict = self._config_backend_dict()
912
926
  self._config_save(config_value_dict)
913
927
 
928
+ do_fork = self._settings.fork_from is not None and is_wandb_init
929
+ do_resume = bool(self._settings.resume)
930
+
931
+ if do_fork and do_resume:
932
+ error = wandb_internal_pb2.ErrorInfo()
933
+ error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
934
+ error.message = (
935
+ "You cannot use `resume` and `fork_from` together. Please choose one."
936
+ )
937
+ self._handle_error(record, error, run)
938
+
914
939
  if is_wandb_init:
915
940
  # Ensure we have a project to query for status
916
941
  if run.project == "":
917
942
  run.project = util.auto_project_name(self._settings.program)
918
943
  # Only check resume status on `wandb.init`
919
- error = self._maybe_setup_resume(run)
944
+
945
+ if do_resume:
946
+ error = self._setup_resume(run)
920
947
 
921
948
  if error is not None:
922
- if record.control.req_resp or record.control.mailbox_slot:
923
- result = proto_util._result_from_record(record)
924
- result.run_result.run.CopyFrom(run)
925
- result.run_result.error.CopyFrom(error)
926
- self._respond_result(result)
927
- else:
928
- logger.error("Got error in async mode: %s", error.message)
949
+ self._handle_error(record, error, run)
929
950
  return
930
951
 
931
952
  # Save the resumed config
@@ -945,19 +966,22 @@ class SendManager:
945
966
  self._config_save(config_value_dict)
946
967
 
947
968
  try:
948
- self._init_run(run, config_value_dict)
969
+ server_run = self._init_run(run, config_value_dict)
949
970
  except (CommError, UsageError) as e:
950
971
  logger.error(e, exc_info=True)
951
- if record.control.req_resp or record.control.mailbox_slot:
952
- result = proto_util._result_from_record(record)
953
- result.run_result.run.CopyFrom(run)
954
- error = ProtobufErrorHandler.from_exception(e)
955
- result.run_result.error.CopyFrom(error)
956
- self._respond_result(result)
972
+ error = ProtobufErrorHandler.from_exception(e)
973
+ self._handle_error(record, error, run)
957
974
  return
958
975
 
959
976
  assert self._run # self._run is configured in _init_run()
960
977
 
978
+ if do_fork:
979
+ error = self._setup_fork(server_run)
980
+
981
+ if error is not None:
982
+ self._handle_error(record, error, run)
983
+ return
984
+
961
985
  if record.control.req_resp or record.control.mailbox_slot:
962
986
  result = proto_util._result_from_record(record)
963
987
  # TODO: we could do self._interface.publish_defer(resp) to notify
@@ -976,7 +1000,7 @@ class SendManager:
976
1000
  self,
977
1001
  run: "RunRecord",
978
1002
  config_dict: Optional[sender_config.BackendConfigDict],
979
- ) -> None:
1003
+ ) -> dict:
980
1004
  # We subtract the previous runs runtime when resuming
981
1005
  start_time = (
982
1006
  run.start_time.ToMicroseconds() / 1e6
@@ -1061,6 +1085,7 @@ class SendManager:
1061
1085
  self._run.sweep_id = sweep_id
1062
1086
  if os.getenv("SPELL_RUN_URL"):
1063
1087
  self._sync_spell()
1088
+ return server_run
1064
1089
 
1065
1090
  def _start_run_threads(self, file_dir: Optional[str] = None) -> None:
1066
1091
  assert self._run # self._run is configured by caller
@@ -2,6 +2,7 @@ from dataclasses import fields
2
2
  from typing import Any, Iterable, Sequence, Tuple
3
3
 
4
4
  from wandb.proto import wandb_settings_pb2
5
+ from wandb.sdk.lib import RunMoment
5
6
  from wandb.sdk.wandb_settings import SettingsData
6
7
 
7
8
 
@@ -38,6 +39,14 @@ class SettingsStatic(SettingsData):
38
39
  unpacked_inner[inner_key] = inner_value
39
40
  unpacked_mapping[outer_key] = unpacked_inner
40
41
  value = unpacked_mapping
42
+ elif key == "fork_from":
43
+ value = getattr(proto, key)
44
+ if value.run:
45
+ value = RunMoment(
46
+ run=value.run, value=value.value, metric=value.metric
47
+ )
48
+ else:
49
+ value = None
41
50
  else:
42
51
  if proto.HasField(key): # type: ignore [arg-type]
43
52
  value = getattr(proto, key).value
@@ -212,7 +212,10 @@ class SystemInfo:
212
212
  os.path.join(self.settings.files_dir, CONDA_ENVIRONMENTS_FNAME), "w"
213
213
  ) as f:
214
214
  subprocess.call(
215
- ["conda", "env", "export"], stdout=f, stderr=subprocess.DEVNULL
215
+ ["conda", "env", "export"],
216
+ stdout=f,
217
+ stderr=subprocess.DEVNULL,
218
+ timeout=15, # add timeout since conda env export could take a really long time
216
219
  )
217
220
  except Exception as e:
218
221
  logger.exception(f"Error saving conda packages: {e}")
@@ -62,6 +62,7 @@ def resolve_agent_config( # noqa: C901
62
62
  max_jobs: Optional[int],
63
63
  queues: Optional[Tuple[str]],
64
64
  config: Optional[str],
65
+ verbosity: Optional[int],
65
66
  ) -> Tuple[Dict[str, Any], Api]:
66
67
  """Resolve the agent config.
67
68
 
@@ -72,6 +73,7 @@ def resolve_agent_config( # noqa: C901
72
73
  max_jobs (int): The max number of jobs.
73
74
  queues (Tuple[str]): The queues.
74
75
  config (str): The config.
76
+ verbosity (int): How verbose to print, 0 or None = default, 1 = print status every 20 seconds, 2 = also print debugging information
75
77
 
76
78
  Returns:
77
79
  Tuple[Dict[str, Any], Api]: The resolved config and api.
@@ -83,6 +85,7 @@ def resolve_agent_config( # noqa: C901
83
85
  "queues": [],
84
86
  "registry": {},
85
87
  "builder": {},
88
+ "verbosity": 0,
86
89
  }
87
90
  user_set_project = False
88
91
  resolved_config: Dict[str, Any] = defaults
@@ -123,6 +126,8 @@ def resolve_agent_config( # noqa: C901
123
126
  resolved_config.update({"max_jobs": int(max_jobs)})
124
127
  if queues:
125
128
  resolved_config.update({"queues": list(queues)})
129
+ if verbosity:
130
+ resolved_config.update({"verbosity": int(verbosity)})
126
131
  # queue -> queues
127
132
  if resolved_config.get("queue"):
128
133
  if isinstance(resolved_config.get("queue"), str):
@@ -14,6 +14,7 @@ import wandb.docker as docker
14
14
  from wandb.apis.internal import Api
15
15
  from wandb.errors import CommError
16
16
  from wandb.sdk.launch import utils
17
+ from wandb.sdk.launch.utils import get_entrypoint_file
17
18
  from wandb.sdk.lib.runid import generate_id
18
19
 
19
20
  from .errors import LaunchError
@@ -135,7 +136,7 @@ class LaunchProject:
135
136
  if override_entrypoint:
136
137
  _logger.info("Adding override entry point")
137
138
  self.override_entrypoint = EntryPoint(
138
- name=_get_entrypoint_file(override_entrypoint),
139
+ name=get_entrypoint_file(override_entrypoint),
139
140
  command=override_entrypoint,
140
141
  )
141
142
 
@@ -536,24 +537,6 @@ class LaunchProject:
536
537
  self.git_version = branch_name
537
538
 
538
539
 
539
- def _get_entrypoint_file(entrypoint: List[str]) -> Optional[str]:
540
- """Get the entrypoint file from the given command.
541
-
542
- Args:
543
- entrypoint (List[str]): List of command and arguments.
544
-
545
- Returns:
546
- Optional[str]: The entrypoint file if found, otherwise None.
547
- """
548
- if not entrypoint:
549
- return None
550
- if entrypoint[0].endswith(".py") or entrypoint[0].endswith(".sh"):
551
- return entrypoint[0]
552
- if len(entrypoint) < 2:
553
- return None
554
- return entrypoint[1]
555
-
556
-
557
540
  class EntryPoint:
558
541
  """An entry point into a wandb launch specification."""
559
542
 
@@ -570,7 +553,9 @@ class EntryPoint:
570
553
 
571
554
  def update_entrypoint_path(self, new_path: str) -> None:
572
555
  """Updates the entrypoint path to a new path."""
573
- if len(self.command) == 2 and self.command[0] in ["python", "bash"]:
556
+ if len(self.command) == 2 and (
557
+ self.command[0].startswith("python") or self.command[0] == "bash"
558
+ ):
574
559
  self.command[1] = new_path
575
560
 
576
561