wandb 0.16.4__py3-none-any.whl → 0.16.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. wandb/__init__.py +2 -2
  2. wandb/agents/pyagent.py +1 -1
  3. wandb/apis/public/api.py +6 -6
  4. wandb/apis/reports/v2/interface.py +4 -8
  5. wandb/apis/reports/v2/internal.py +12 -45
  6. wandb/cli/cli.py +29 -5
  7. wandb/integration/openai/fine_tuning.py +74 -37
  8. wandb/integration/ultralytics/callback.py +0 -1
  9. wandb/proto/v3/wandb_internal_pb2.py +332 -312
  10. wandb/proto/v3/wandb_settings_pb2.py +13 -3
  11. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  12. wandb/proto/v4/wandb_internal_pb2.py +316 -312
  13. wandb/proto/v4/wandb_settings_pb2.py +5 -3
  14. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  15. wandb/sdk/artifacts/artifact.py +92 -26
  16. wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
  17. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
  18. wandb/sdk/artifacts/artifact_saver.py +16 -36
  19. wandb/sdk/artifacts/storage_handler.py +2 -1
  20. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +13 -5
  21. wandb/sdk/interface/interface.py +60 -15
  22. wandb/sdk/interface/interface_shared.py +13 -7
  23. wandb/sdk/internal/file_stream.py +19 -0
  24. wandb/sdk/internal/handler.py +1 -4
  25. wandb/sdk/internal/internal_api.py +2 -0
  26. wandb/sdk/internal/job_builder.py +45 -17
  27. wandb/sdk/internal/sender.py +53 -28
  28. wandb/sdk/internal/settings_static.py +9 -0
  29. wandb/sdk/internal/system/system_info.py +4 -1
  30. wandb/sdk/launch/_launch.py +5 -0
  31. wandb/sdk/launch/_project_spec.py +5 -20
  32. wandb/sdk/launch/agent/agent.py +80 -37
  33. wandb/sdk/launch/agent/config.py +8 -0
  34. wandb/sdk/launch/builder/kaniko_builder.py +149 -134
  35. wandb/sdk/launch/create_job.py +44 -48
  36. wandb/sdk/launch/runner/kubernetes_monitor.py +3 -1
  37. wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
  38. wandb/sdk/launch/sweeps/scheduler.py +3 -1
  39. wandb/sdk/launch/utils.py +23 -5
  40. wandb/sdk/lib/__init__.py +2 -5
  41. wandb/sdk/lib/_settings_toposort_generated.py +2 -0
  42. wandb/sdk/lib/filesystem.py +11 -1
  43. wandb/sdk/lib/run_moment.py +78 -0
  44. wandb/sdk/service/streams.py +1 -6
  45. wandb/sdk/wandb_init.py +12 -7
  46. wandb/sdk/wandb_login.py +43 -26
  47. wandb/sdk/wandb_run.py +179 -94
  48. wandb/sdk/wandb_settings.py +55 -16
  49. wandb/testing/relay.py +5 -6
  50. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/METADATA +1 -1
  51. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/RECORD +55 -54
  52. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/WHEEL +1 -1
  53. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/LICENSE +0 -0
  54. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/entry_points.txt +0 -0
  55. {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/top_level.txt +0 -0
@@ -13,8 +13,19 @@ import os
13
13
  import sys
14
14
  import time
15
15
  from abc import abstractmethod
16
- from typing import TYPE_CHECKING, Any, Dict, Iterable, NewType, Optional, Tuple, Union
16
+ from typing import (
17
+ TYPE_CHECKING,
18
+ Any,
19
+ Dict,
20
+ Iterable,
21
+ List,
22
+ NewType,
23
+ Optional,
24
+ Tuple,
25
+ Union,
26
+ )
17
27
 
28
+ from wandb import termwarn
18
29
  from wandb.proto import wandb_internal_pb2 as pb
19
30
  from wandb.proto import wandb_telemetry_pb2 as tpb
20
31
  from wandb.sdk.artifacts.artifact import Artifact
@@ -340,6 +351,7 @@ class InterfaceBase:
340
351
  proto_entry.ref = entry.ref
341
352
  if entry.local_path:
342
353
  proto_entry.local_path = entry.local_path
354
+ proto_entry.skip_cache = entry.skip_cache
343
355
  for k, v in entry.extra.items():
344
356
  proto_extra = proto_entry.extra.add()
345
357
  proto_extra.key = k
@@ -436,16 +448,27 @@ class InterfaceBase:
436
448
  path = artifact.get_entry("wandb-job.json").download()
437
449
  with open(path) as f:
438
450
  job_info = json.load(f)
451
+
439
452
  except Exception as e:
440
453
  logger.warning(
441
454
  f"Failed to download partial job info from artifact {artifact}, : {e}"
442
455
  )
443
- use_artifact = self._make_proto_use_artifact(
444
- use_artifact=use_artifact,
445
- job_name=artifact.name,
446
- job_info=job_info,
447
- metadata=artifact.metadata,
448
- )
456
+ termwarn(
457
+ f"Failed to download partial job info from artifact {artifact}, : {e}"
458
+ )
459
+ return
460
+
461
+ try:
462
+ use_artifact = self._make_proto_use_artifact(
463
+ use_artifact=use_artifact,
464
+ job_name=artifact.name,
465
+ job_info=job_info,
466
+ metadata=artifact.metadata,
467
+ )
468
+ except Exception as e:
469
+ logger.warning(f"Failed to construct use artifact proto: {e}")
470
+ termwarn(f"Failed to construct use artifact proto: {e}")
471
+ return
449
472
 
450
473
  self._publish_use_artifact(use_artifact)
451
474
 
@@ -756,6 +779,36 @@ class InterfaceBase:
756
779
  run_start.run.CopyFrom(run_pb)
757
780
  return self._deliver_run_start(run_start)
758
781
 
782
+ def publish_launch_wandb_config_parameters(
783
+ self, include_paths: List[List[str]], exclude_paths: List[List[str]]
784
+ ):
785
+ """Tells the internal process to treat wandb.config fields as job inputs.
786
+
787
+ The paths provided as arguments are sequences of dictionary keys that
788
+ specify a path within the wandb.config. If a path is included, the
789
+ corresponding field will be treated as a job input. If a path is
790
+ excluded, the corresponding field will not be treated as a job input.
791
+
792
+ Args:
793
+ include_paths: paths within config to include as job inputs.
794
+ exclude_paths: paths within config to exclude as job inputs.
795
+
796
+ Returns:
797
+ None
798
+ """
799
+ config_parameters = pb.LaunchWandbConfigParametersRecord()
800
+ include_records = [pb.ConfigFilterPath(path=path) for path in include_paths]
801
+ exclude_records = [pb.ConfigFilterPath(path=path) for path in exclude_paths]
802
+ config_parameters.include_paths.extend(include_records)
803
+ config_parameters.exclude_paths.extend(exclude_records)
804
+ return self._publish_launch_wandb_config_parameters(config_parameters)
805
+
806
+ @abstractmethod
807
+ def _publish_launch_wandb_config_parameters(
808
+ self, config_parameters: pb.LaunchWandbConfigParametersRecord
809
+ ) -> None:
810
+ raise NotImplementedError
811
+
759
812
  @abstractmethod
760
813
  def _deliver_run_start(self, run_start: pb.RunStartRequest) -> MailboxHandle:
761
814
  raise NotImplementedError
@@ -871,11 +924,3 @@ class InterfaceBase:
871
924
  self, run_status: pb.RunStatusRequest
872
925
  ) -> MailboxHandle:
873
926
  raise NotImplementedError
874
-
875
- def deliver_request_job_info(self) -> MailboxHandle:
876
- job_info = pb.JobInfoRequest()
877
- return self._deliver_request_job_info(job_info)
878
-
879
- @abstractmethod
880
- def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
881
- raise NotImplementedError
@@ -145,7 +145,6 @@ class InterfaceShared(InterfaceBase):
145
145
  cancel: Optional[pb.CancelRequest] = None,
146
146
  summary_record: Optional[pb.SummaryRecordRequest] = None,
147
147
  telemetry_record: Optional[pb.TelemetryRecordRequest] = None,
148
- job_info: Optional[pb.JobInfoRequest] = None,
149
148
  get_system_metrics: Optional[pb.GetSystemMetricsRequest] = None,
150
149
  python_packages: Optional[pb.PythonPackagesRequest] = None,
151
150
  ) -> pb.Record:
@@ -202,8 +201,6 @@ class InterfaceShared(InterfaceBase):
202
201
  request.summary_record.CopyFrom(summary_record)
203
202
  elif telemetry_record:
204
203
  request.telemetry_record.CopyFrom(telemetry_record)
205
- elif job_info:
206
- request.job_info.CopyFrom(job_info)
207
204
  elif get_system_metrics:
208
205
  request.get_system_metrics.CopyFrom(get_system_metrics)
209
206
  elif sync:
@@ -242,6 +239,9 @@ class InterfaceShared(InterfaceBase):
242
239
  use_artifact: Optional[pb.UseArtifactRecord] = None,
243
240
  output: Optional[pb.OutputRecord] = None,
244
241
  output_raw: Optional[pb.OutputRawRecord] = None,
242
+ launch_wandb_config_parameters: Optional[
243
+ pb.LaunchWandbConfigParametersRecord
244
+ ] = None,
245
245
  ) -> pb.Record:
246
246
  record = pb.Record()
247
247
  if run:
@@ -286,6 +286,8 @@ class InterfaceShared(InterfaceBase):
286
286
  record.output.CopyFrom(output)
287
287
  elif output_raw:
288
288
  record.output_raw.CopyFrom(output_raw)
289
+ elif launch_wandb_config_parameters:
290
+ record.wandb_config_parameters.CopyFrom(launch_wandb_config_parameters)
289
291
  else:
290
292
  raise Exception("Invalid record")
291
293
  return record
@@ -415,6 +417,14 @@ class InterfaceShared(InterfaceBase):
415
417
  rec = self._make_record(alert=proto_alert)
416
418
  self._publish(rec)
417
419
 
420
+ def _publish_launch_wandb_config_parameters(
421
+ self, launch_wandb_config_parameters: pb.LaunchWandbConfigParametersRecord
422
+ ) -> None:
423
+ rec = self._make_record(
424
+ launch_wandb_config_parameters=launch_wandb_config_parameters
425
+ )
426
+ self._publish(rec)
427
+
418
428
  def _communicate_status(
419
429
  self, status: pb.StatusRequest
420
430
  ) -> Optional[pb.StatusResponse]:
@@ -523,10 +533,6 @@ class InterfaceShared(InterfaceBase):
523
533
  record = self._make_request(run_status=run_status)
524
534
  return self._deliver_record(record)
525
535
 
526
- def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
527
- record = self._make_request(job_info=job_info)
528
- return self._deliver_record(record)
529
-
530
536
  def _transport_keepalive_failed(self, keepalive_interval: int = 5) -> bool:
531
537
  if self._transport_failed:
532
538
  return True
@@ -1,6 +1,7 @@
1
1
  import base64
2
2
  import functools
3
3
  import itertools
4
+ import json
4
5
  import logging
5
6
  import os
6
7
  import queue
@@ -58,6 +59,7 @@ class Chunk(NamedTuple):
58
59
  class DefaultFilePolicy:
59
60
  def __init__(self, start_chunk_id: int = 0) -> None:
60
61
  self._chunk_id = start_chunk_id
62
+ self.has_debug_log = False
61
63
 
62
64
  def process_chunks(
63
65
  self, chunks: List[Chunk]
@@ -66,6 +68,21 @@ class DefaultFilePolicy:
66
68
  self._chunk_id += len(chunks)
67
69
  return {"offset": chunk_id, "content": [c.data for c in chunks]}
68
70
 
71
+ # TODO: this is very inefficient, this is meant for temporary debugging and will be removed in future releases
72
+ def _debug_log(self, data: Any):
73
+ if self.has_debug_log or not os.environ.get("WANDB_DEBUG_FILESTREAM_LOG"):
74
+ return
75
+
76
+ loaded = json.loads(data)
77
+ if not isinstance(loaded, dict):
78
+ return
79
+
80
+ # get key size and convert to MB
81
+ key_sizes = [(k, len(json.dumps(v))) for k, v in loaded.items()]
82
+ key_msg = [f"{k}: {v/1048576:.5f} MB" for k, v in key_sizes]
83
+ wandb.termerror(f"Step: {loaded['_step']} | {key_msg}", repeat=False)
84
+ self.has_debug_log = True
85
+
69
86
 
70
87
  class JsonlFilePolicy(DefaultFilePolicy):
71
88
  def process_chunks(self, chunks: List[Chunk]) -> "ProcessedChunk":
@@ -81,6 +98,7 @@ class JsonlFilePolicy(DefaultFilePolicy):
81
98
  )
82
99
  wandb.termerror(msg, repeat=False)
83
100
  wandb._sentry.message(msg, repeat=False)
101
+ self._debug_log(chunk.data)
84
102
  else:
85
103
  chunk_data.append(chunk.data)
86
104
 
@@ -99,6 +117,7 @@ class SummaryFilePolicy(DefaultFilePolicy):
99
117
  )
100
118
  wandb.termerror(msg, repeat=False)
101
119
  wandb._sentry.message(msg, repeat=False)
120
+ self._debug_log(data)
102
121
  return False
103
122
  return {"offset": 0, "content": [data]}
104
123
 
@@ -689,7 +689,7 @@ class HandleManager:
689
689
  self._settings, interface=self._interface, run_proto=run_start.run
690
690
  )
691
691
 
692
- if run_start.run.resumed:
692
+ if run_start.run.resumed or run_start.run.forked:
693
693
  self._step = run_start.run.starting_step
694
694
  result = proto_util._result_from_record(record)
695
695
  self._respond_result(result)
@@ -862,9 +862,6 @@ class HandleManager:
862
862
  self._respond_result(result)
863
863
  self._stopped.set()
864
864
 
865
- def handle_request_job_info(self, record: Record) -> None:
866
- self._dispatch_record(record, always_send=True)
867
-
868
865
  def finish(self) -> None:
869
866
  logger.info("shutting down handler")
870
867
  if self._system_monitor is not None:
@@ -2150,6 +2150,7 @@ class Api:
2150
2150
  name
2151
2151
  }
2152
2152
  }
2153
+ historyLineCount
2153
2154
  }
2154
2155
  inserted
2155
2156
  _Server_Settings_
@@ -2237,6 +2238,7 @@ class Api:
2237
2238
  .get("serverSettings", {})
2238
2239
  .get("serverMessages", [])
2239
2240
  )
2241
+
2240
2242
  return (
2241
2243
  response["upsertBucket"]["bucket"],
2242
2244
  response["upsertBucket"]["inserted"],
@@ -4,7 +4,7 @@ import logging
4
4
  import os
5
5
  import re
6
6
  import sys
7
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
7
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
8
8
 
9
9
  import wandb
10
10
  from wandb.sdk.artifacts.artifact import Artifact
@@ -28,6 +28,8 @@ FROZEN_REQUIREMENTS_FNAME = "requirements.frozen.txt"
28
28
  JOB_FNAME = "wandb-job.json"
29
29
  JOB_ARTIFACT_TYPE = "job"
30
30
 
31
+ LOG_LEVEL = Literal["log", "warn", "error"]
32
+
31
33
 
32
34
  class GitInfo(TypedDict):
33
35
  remote: str
@@ -89,8 +91,9 @@ class JobBuilder:
89
91
  _job_seq_id: Optional[str]
90
92
  _job_version_alias: Optional[str]
91
93
  _is_notebook_run: bool
94
+ _verbose: bool
92
95
 
93
- def __init__(self, settings: SettingsStatic):
96
+ def __init__(self, settings: SettingsStatic, verbose: bool = False):
94
97
  self._settings = settings
95
98
  self._metadatafile_path = None
96
99
  self._requirements_path = None
@@ -106,6 +109,7 @@ class JobBuilder:
106
109
  Literal["repo", "artifact", "image"]
107
110
  ] = settings.job_source # type: ignore[assignment]
108
111
  self._is_notebook_run = self._get_is_notebook_run()
112
+ self._verbose = verbose
109
113
 
110
114
  def set_config(self, config: Dict[str, Any]) -> None:
111
115
  self._config = config
@@ -197,6 +201,21 @@ class JobBuilder:
197
201
 
198
202
  return source, name
199
203
 
204
+ def _log_if_verbose(self, message: str, level: LOG_LEVEL) -> None:
205
+ log_func: Optional[Union[Callable[[Any], None], Callable[[Any], None]]] = None
206
+ if level == "log":
207
+ _logger.info(message)
208
+ log_func = wandb.termlog
209
+ elif level == "warn":
210
+ _logger.warning(message)
211
+ log_func = wandb.termwarn
212
+ elif level == "error":
213
+ _logger.error(message)
214
+ log_func = wandb.termerror
215
+
216
+ if self._verbose and log_func is not None:
217
+ log_func(message)
218
+
200
219
  def _build_artifact_job_source(
201
220
  self,
202
221
  program_relpath: str,
@@ -212,8 +231,9 @@ class JobBuilder:
212
231
  # at the directory the notebook is in instead of the jupyter core
213
232
  if not os.path.exists(os.path.basename(program_relpath)):
214
233
  _logger.info("target path does not exist, exiting")
215
- wandb.termwarn(
216
- "No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job"
234
+ self._log_if_verbose(
235
+ "No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job",
236
+ "warn",
217
237
  )
218
238
  return None, None
219
239
  full_program_relpath = os.path.basename(program_relpath)
@@ -299,22 +319,25 @@ class JobBuilder:
299
319
  if not os.path.exists(
300
320
  os.path.join(self._settings.files_dir, REQUIREMENTS_FNAME)
301
321
  ):
302
- wandb.termwarn(
303
- "No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
322
+ self._log_if_verbose(
323
+ "No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
324
+ "warn",
304
325
  )
305
326
  return None
306
327
  metadata = self._handle_metadata_file()
307
328
  if metadata is None:
308
- wandb.termwarn(
309
- f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables"
329
+ self._log_if_verbose(
330
+ f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables",
331
+ "warn",
310
332
  )
311
333
  return None
312
334
 
313
335
  runtime: Optional[str] = metadata.get("python")
314
336
  # can't build a job without a python version
315
337
  if runtime is None:
316
- wandb.termwarn(
317
- "No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
338
+ self._log_if_verbose(
339
+ "No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
340
+ "warn",
318
341
  )
319
342
  return None
320
343
 
@@ -345,13 +368,16 @@ class JobBuilder:
345
368
  or self._settings.job_source
346
369
  or self._source_type
347
370
  ):
348
- wandb.termwarn("No source type found, not creating job artifact")
371
+ self._log_if_verbose(
372
+ "No source type found, not creating job artifact", "warn"
373
+ )
349
374
  return None
350
375
 
351
376
  program_relpath = self._get_program_relpath(source_type, metadata)
352
377
  if source_type != "image" and not program_relpath:
353
- wandb.termwarn(
354
- "No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
378
+ self._log_if_verbose(
379
+ "No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
380
+ "warn",
355
381
  )
356
382
  return None
357
383
 
@@ -377,10 +403,11 @@ class JobBuilder:
377
403
 
378
404
  if source is None:
379
405
  if source_type:
380
- wandb.termwarn(
406
+ self._log_if_verbose(
381
407
  f"Source type is set to '{source_type}' but some required information is missing "
382
408
  "from the environment. A job will not be created from this run. See "
383
- "https://docs.wandb.ai/guides/launch/create-job"
409
+ "https://docs.wandb.ai/guides/launch/create-job",
410
+ "warn",
384
411
  )
385
412
  return None
386
413
 
@@ -447,8 +474,9 @@ class JobBuilder:
447
474
  program = metadata.get("program")
448
475
 
449
476
  if not program:
450
- wandb.termwarn(
451
- "Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job"
477
+ self._log_if_verbose(
478
+ "Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job",
479
+ "warn",
452
480
  )
453
481
 
454
482
  return program
@@ -115,6 +115,7 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
115
115
  "ref": content.ref if content.ref else None,
116
116
  "size": content.size if content.size is not None else None,
117
117
  "local_path": content.local_path if content.local_path else None,
118
+ "skip_cache": content.skip_cache,
118
119
  "extra": {
119
120
  extra.key: json.loads(extra.value_json) for extra in content.extra
120
121
  },
@@ -733,18 +734,7 @@ class SendManager:
733
734
  )
734
735
  self._respond_result(result)
735
736
 
736
- def send_request_job_info(self, record: "Record") -> None:
737
- """Respond to a request for a job link."""
738
- result = proto_util._result_from_record(record)
739
- result.response.job_info_response.sequenceId = (
740
- self._job_builder._job_seq_id or ""
741
- )
742
- result.response.job_info_response.version = (
743
- self._job_builder._job_version_alias or ""
744
- )
745
- self._respond_result(result)
746
-
747
- def _maybe_setup_resume(
737
+ def _setup_resume(
748
738
  self, run: "RunRecord"
749
739
  ) -> Optional["wandb_internal_pb2.ErrorInfo"]:
750
740
  """Queries the backend for a run; fail if the settings are incompatible."""
@@ -890,6 +880,30 @@ class SendManager:
890
880
  pass
891
881
  # TODO: do something if sync spell is not successful?
892
882
 
883
+ def _setup_fork(self, server_run: dict):
884
+ assert self._settings.fork_from
885
+ assert self._settings.fork_from.metric == "_step"
886
+ assert self._run
887
+ first_step = int(self._settings.fork_from.value) + 1
888
+ self._resume_state.step = first_step
889
+ self._resume_state.history = server_run.get("historyLineCount", 0)
890
+ self._run.forked = True
891
+ self._run.starting_step = first_step
892
+
893
+ def _handle_error(
894
+ self,
895
+ record: "Record",
896
+ error: "wandb_internal_pb2.ErrorInfo",
897
+ run: "RunRecord",
898
+ ) -> None:
899
+ if record.control.req_resp or record.control.mailbox_slot:
900
+ result = proto_util._result_from_record(record)
901
+ result.run_result.run.CopyFrom(run)
902
+ result.run_result.error.CopyFrom(error)
903
+ self._respond_result(result)
904
+ else:
905
+ logger.error("Got error in async mode: %s", error.message)
906
+
893
907
  def send_run(self, record: "Record", file_dir: Optional[str] = None) -> None:
894
908
  run = record.run
895
909
  error = None
@@ -911,21 +925,28 @@ class SendManager:
911
925
  config_value_dict = self._config_backend_dict()
912
926
  self._config_save(config_value_dict)
913
927
 
928
+ do_fork = self._settings.fork_from is not None and is_wandb_init
929
+ do_resume = bool(self._settings.resume)
930
+
931
+ if do_fork and do_resume:
932
+ error = wandb_internal_pb2.ErrorInfo()
933
+ error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
934
+ error.message = (
935
+ "You cannot use `resume` and `fork_from` together. Please choose one."
936
+ )
937
+ self._handle_error(record, error, run)
938
+
914
939
  if is_wandb_init:
915
940
  # Ensure we have a project to query for status
916
941
  if run.project == "":
917
942
  run.project = util.auto_project_name(self._settings.program)
918
943
  # Only check resume status on `wandb.init`
919
- error = self._maybe_setup_resume(run)
944
+
945
+ if do_resume:
946
+ error = self._setup_resume(run)
920
947
 
921
948
  if error is not None:
922
- if record.control.req_resp or record.control.mailbox_slot:
923
- result = proto_util._result_from_record(record)
924
- result.run_result.run.CopyFrom(run)
925
- result.run_result.error.CopyFrom(error)
926
- self._respond_result(result)
927
- else:
928
- logger.error("Got error in async mode: %s", error.message)
949
+ self._handle_error(record, error, run)
929
950
  return
930
951
 
931
952
  # Save the resumed config
@@ -945,19 +966,22 @@ class SendManager:
945
966
  self._config_save(config_value_dict)
946
967
 
947
968
  try:
948
- self._init_run(run, config_value_dict)
969
+ server_run = self._init_run(run, config_value_dict)
949
970
  except (CommError, UsageError) as e:
950
971
  logger.error(e, exc_info=True)
951
- if record.control.req_resp or record.control.mailbox_slot:
952
- result = proto_util._result_from_record(record)
953
- result.run_result.run.CopyFrom(run)
954
- error = ProtobufErrorHandler.from_exception(e)
955
- result.run_result.error.CopyFrom(error)
956
- self._respond_result(result)
972
+ error = ProtobufErrorHandler.from_exception(e)
973
+ self._handle_error(record, error, run)
957
974
  return
958
975
 
959
976
  assert self._run # self._run is configured in _init_run()
960
977
 
978
+ if do_fork:
979
+ error = self._setup_fork(server_run)
980
+
981
+ if error is not None:
982
+ self._handle_error(record, error, run)
983
+ return
984
+
961
985
  if record.control.req_resp or record.control.mailbox_slot:
962
986
  result = proto_util._result_from_record(record)
963
987
  # TODO: we could do self._interface.publish_defer(resp) to notify
@@ -976,7 +1000,7 @@ class SendManager:
976
1000
  self,
977
1001
  run: "RunRecord",
978
1002
  config_dict: Optional[sender_config.BackendConfigDict],
979
- ) -> None:
1003
+ ) -> dict:
980
1004
  # We subtract the previous runs runtime when resuming
981
1005
  start_time = (
982
1006
  run.start_time.ToMicroseconds() / 1e6
@@ -1061,6 +1085,7 @@ class SendManager:
1061
1085
  self._run.sweep_id = sweep_id
1062
1086
  if os.getenv("SPELL_RUN_URL"):
1063
1087
  self._sync_spell()
1088
+ return server_run
1064
1089
 
1065
1090
  def _start_run_threads(self, file_dir: Optional[str] = None) -> None:
1066
1091
  assert self._run # self._run is configured by caller
@@ -2,6 +2,7 @@ from dataclasses import fields
2
2
  from typing import Any, Iterable, Sequence, Tuple
3
3
 
4
4
  from wandb.proto import wandb_settings_pb2
5
+ from wandb.sdk.lib import RunMoment
5
6
  from wandb.sdk.wandb_settings import SettingsData
6
7
 
7
8
 
@@ -38,6 +39,14 @@ class SettingsStatic(SettingsData):
38
39
  unpacked_inner[inner_key] = inner_value
39
40
  unpacked_mapping[outer_key] = unpacked_inner
40
41
  value = unpacked_mapping
42
+ elif key == "fork_from":
43
+ value = getattr(proto, key)
44
+ if value.run:
45
+ value = RunMoment(
46
+ run=value.run, value=value.value, metric=value.metric
47
+ )
48
+ else:
49
+ value = None
41
50
  else:
42
51
  if proto.HasField(key): # type: ignore [arg-type]
43
52
  value = getattr(proto, key).value
@@ -212,7 +212,10 @@ class SystemInfo:
212
212
  os.path.join(self.settings.files_dir, CONDA_ENVIRONMENTS_FNAME), "w"
213
213
  ) as f:
214
214
  subprocess.call(
215
- ["conda", "env", "export"], stdout=f, stderr=subprocess.DEVNULL
215
+ ["conda", "env", "export"],
216
+ stdout=f,
217
+ stderr=subprocess.DEVNULL,
218
+ timeout=15, # add timeout since conda env export could take a really long time
216
219
  )
217
220
  except Exception as e:
218
221
  logger.exception(f"Error saving conda packages: {e}")
@@ -62,6 +62,7 @@ def resolve_agent_config( # noqa: C901
62
62
  max_jobs: Optional[int],
63
63
  queues: Optional[Tuple[str]],
64
64
  config: Optional[str],
65
+ verbosity: Optional[int],
65
66
  ) -> Tuple[Dict[str, Any], Api]:
66
67
  """Resolve the agent config.
67
68
 
@@ -72,6 +73,7 @@ def resolve_agent_config( # noqa: C901
72
73
  max_jobs (int): The max number of jobs.
73
74
  queues (Tuple[str]): The queues.
74
75
  config (str): The config.
76
+ verbosity (int): How verbose to print, 0 or None = default, 1 = print status every 20 seconds, 2 = also print debugging information
75
77
 
76
78
  Returns:
77
79
  Tuple[Dict[str, Any], Api]: The resolved config and api.
@@ -83,6 +85,7 @@ def resolve_agent_config( # noqa: C901
83
85
  "queues": [],
84
86
  "registry": {},
85
87
  "builder": {},
88
+ "verbosity": 0,
86
89
  }
87
90
  user_set_project = False
88
91
  resolved_config: Dict[str, Any] = defaults
@@ -123,6 +126,8 @@ def resolve_agent_config( # noqa: C901
123
126
  resolved_config.update({"max_jobs": int(max_jobs)})
124
127
  if queues:
125
128
  resolved_config.update({"queues": list(queues)})
129
+ if verbosity:
130
+ resolved_config.update({"verbosity": int(verbosity)})
126
131
  # queue -> queues
127
132
  if resolved_config.get("queue"):
128
133
  if isinstance(resolved_config.get("queue"), str):
@@ -14,6 +14,7 @@ import wandb.docker as docker
14
14
  from wandb.apis.internal import Api
15
15
  from wandb.errors import CommError
16
16
  from wandb.sdk.launch import utils
17
+ from wandb.sdk.launch.utils import get_entrypoint_file
17
18
  from wandb.sdk.lib.runid import generate_id
18
19
 
19
20
  from .errors import LaunchError
@@ -135,7 +136,7 @@ class LaunchProject:
135
136
  if override_entrypoint:
136
137
  _logger.info("Adding override entry point")
137
138
  self.override_entrypoint = EntryPoint(
138
- name=_get_entrypoint_file(override_entrypoint),
139
+ name=get_entrypoint_file(override_entrypoint),
139
140
  command=override_entrypoint,
140
141
  )
141
142
 
@@ -536,24 +537,6 @@ class LaunchProject:
536
537
  self.git_version = branch_name
537
538
 
538
539
 
539
- def _get_entrypoint_file(entrypoint: List[str]) -> Optional[str]:
540
- """Get the entrypoint file from the given command.
541
-
542
- Args:
543
- entrypoint (List[str]): List of command and arguments.
544
-
545
- Returns:
546
- Optional[str]: The entrypoint file if found, otherwise None.
547
- """
548
- if not entrypoint:
549
- return None
550
- if entrypoint[0].endswith(".py") or entrypoint[0].endswith(".sh"):
551
- return entrypoint[0]
552
- if len(entrypoint) < 2:
553
- return None
554
- return entrypoint[1]
555
-
556
-
557
540
  class EntryPoint:
558
541
  """An entry point into a wandb launch specification."""
559
542
 
@@ -570,7 +553,9 @@ class EntryPoint:
570
553
 
571
554
  def update_entrypoint_path(self, new_path: str) -> None:
572
555
  """Updates the entrypoint path to a new path."""
573
- if len(self.command) == 2 and self.command[0] in ["python", "bash"]:
556
+ if len(self.command) == 2 and (
557
+ self.command[0].startswith("python") or self.command[0] == "bash"
558
+ ):
574
559
  self.command[1] = new_path
575
560
 
576
561