wandb 0.16.4__py3-none-any.whl → 0.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/public/api.py +6 -6
- wandb/apis/reports/v2/interface.py +4 -8
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +29 -5
- wandb/integration/openai/fine_tuning.py +74 -37
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +92 -26
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
- wandb/sdk/artifacts/artifact_saver.py +16 -36
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +13 -5
- wandb/sdk/interface/interface.py +60 -15
- wandb/sdk/interface/interface_shared.py +13 -7
- wandb/sdk/internal/file_stream.py +19 -0
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +2 -0
- wandb/sdk/internal/job_builder.py +45 -17
- wandb/sdk/internal/sender.py +53 -28
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +4 -1
- wandb/sdk/launch/_launch.py +5 -0
- wandb/sdk/launch/_project_spec.py +5 -20
- wandb/sdk/launch/agent/agent.py +80 -37
- wandb/sdk/launch/agent/config.py +8 -0
- wandb/sdk/launch/builder/kaniko_builder.py +149 -134
- wandb/sdk/launch/create_job.py +44 -48
- wandb/sdk/launch/runner/kubernetes_monitor.py +3 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/sweeps/scheduler.py +3 -1
- wandb/sdk/launch/utils.py +23 -5
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +2 -0
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +78 -0
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/wandb_init.py +12 -7
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +179 -94
- wandb/sdk/wandb_settings.py +55 -16
- wandb/testing/relay.py +5 -6
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/METADATA +1 -1
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/RECORD +55 -54
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/WHEEL +1 -1
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/LICENSE +0 -0
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/top_level.txt +0 -0
wandb/sdk/interface/interface.py
CHANGED
@@ -13,8 +13,19 @@ import os
|
|
13
13
|
import sys
|
14
14
|
import time
|
15
15
|
from abc import abstractmethod
|
16
|
-
from typing import
|
16
|
+
from typing import (
|
17
|
+
TYPE_CHECKING,
|
18
|
+
Any,
|
19
|
+
Dict,
|
20
|
+
Iterable,
|
21
|
+
List,
|
22
|
+
NewType,
|
23
|
+
Optional,
|
24
|
+
Tuple,
|
25
|
+
Union,
|
26
|
+
)
|
17
27
|
|
28
|
+
from wandb import termwarn
|
18
29
|
from wandb.proto import wandb_internal_pb2 as pb
|
19
30
|
from wandb.proto import wandb_telemetry_pb2 as tpb
|
20
31
|
from wandb.sdk.artifacts.artifact import Artifact
|
@@ -340,6 +351,7 @@ class InterfaceBase:
|
|
340
351
|
proto_entry.ref = entry.ref
|
341
352
|
if entry.local_path:
|
342
353
|
proto_entry.local_path = entry.local_path
|
354
|
+
proto_entry.skip_cache = entry.skip_cache
|
343
355
|
for k, v in entry.extra.items():
|
344
356
|
proto_extra = proto_entry.extra.add()
|
345
357
|
proto_extra.key = k
|
@@ -436,16 +448,27 @@ class InterfaceBase:
|
|
436
448
|
path = artifact.get_entry("wandb-job.json").download()
|
437
449
|
with open(path) as f:
|
438
450
|
job_info = json.load(f)
|
451
|
+
|
439
452
|
except Exception as e:
|
440
453
|
logger.warning(
|
441
454
|
f"Failed to download partial job info from artifact {artifact}, : {e}"
|
442
455
|
)
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
456
|
+
termwarn(
|
457
|
+
f"Failed to download partial job info from artifact {artifact}, : {e}"
|
458
|
+
)
|
459
|
+
return
|
460
|
+
|
461
|
+
try:
|
462
|
+
use_artifact = self._make_proto_use_artifact(
|
463
|
+
use_artifact=use_artifact,
|
464
|
+
job_name=artifact.name,
|
465
|
+
job_info=job_info,
|
466
|
+
metadata=artifact.metadata,
|
467
|
+
)
|
468
|
+
except Exception as e:
|
469
|
+
logger.warning(f"Failed to construct use artifact proto: {e}")
|
470
|
+
termwarn(f"Failed to construct use artifact proto: {e}")
|
471
|
+
return
|
449
472
|
|
450
473
|
self._publish_use_artifact(use_artifact)
|
451
474
|
|
@@ -756,6 +779,36 @@ class InterfaceBase:
|
|
756
779
|
run_start.run.CopyFrom(run_pb)
|
757
780
|
return self._deliver_run_start(run_start)
|
758
781
|
|
782
|
+
def publish_launch_wandb_config_parameters(
|
783
|
+
self, include_paths: List[List[str]], exclude_paths: List[List[str]]
|
784
|
+
):
|
785
|
+
"""Tells the internal process to treat wandb.config fields as job inputs.
|
786
|
+
|
787
|
+
The paths provided as arguments are sequences of dictionary keys that
|
788
|
+
specify a path within the wandb.config. If a path is included, the
|
789
|
+
corresponding field will be treated as a job input. If a path is
|
790
|
+
excluded, the corresponding field will not be treated as a job input.
|
791
|
+
|
792
|
+
Args:
|
793
|
+
include_paths: paths within config to include as job inputs.
|
794
|
+
exclude_paths: paths within config to exclude as job inputs.
|
795
|
+
|
796
|
+
Returns:
|
797
|
+
None
|
798
|
+
"""
|
799
|
+
config_parameters = pb.LaunchWandbConfigParametersRecord()
|
800
|
+
include_records = [pb.ConfigFilterPath(path=path) for path in include_paths]
|
801
|
+
exclude_records = [pb.ConfigFilterPath(path=path) for path in exclude_paths]
|
802
|
+
config_parameters.include_paths.extend(include_records)
|
803
|
+
config_parameters.exclude_paths.extend(exclude_records)
|
804
|
+
return self._publish_launch_wandb_config_parameters(config_parameters)
|
805
|
+
|
806
|
+
@abstractmethod
|
807
|
+
def _publish_launch_wandb_config_parameters(
|
808
|
+
self, config_parameters: pb.LaunchWandbConfigParametersRecord
|
809
|
+
) -> None:
|
810
|
+
raise NotImplementedError
|
811
|
+
|
759
812
|
@abstractmethod
|
760
813
|
def _deliver_run_start(self, run_start: pb.RunStartRequest) -> MailboxHandle:
|
761
814
|
raise NotImplementedError
|
@@ -871,11 +924,3 @@ class InterfaceBase:
|
|
871
924
|
self, run_status: pb.RunStatusRequest
|
872
925
|
) -> MailboxHandle:
|
873
926
|
raise NotImplementedError
|
874
|
-
|
875
|
-
def deliver_request_job_info(self) -> MailboxHandle:
|
876
|
-
job_info = pb.JobInfoRequest()
|
877
|
-
return self._deliver_request_job_info(job_info)
|
878
|
-
|
879
|
-
@abstractmethod
|
880
|
-
def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
|
881
|
-
raise NotImplementedError
|
@@ -145,7 +145,6 @@ class InterfaceShared(InterfaceBase):
|
|
145
145
|
cancel: Optional[pb.CancelRequest] = None,
|
146
146
|
summary_record: Optional[pb.SummaryRecordRequest] = None,
|
147
147
|
telemetry_record: Optional[pb.TelemetryRecordRequest] = None,
|
148
|
-
job_info: Optional[pb.JobInfoRequest] = None,
|
149
148
|
get_system_metrics: Optional[pb.GetSystemMetricsRequest] = None,
|
150
149
|
python_packages: Optional[pb.PythonPackagesRequest] = None,
|
151
150
|
) -> pb.Record:
|
@@ -202,8 +201,6 @@ class InterfaceShared(InterfaceBase):
|
|
202
201
|
request.summary_record.CopyFrom(summary_record)
|
203
202
|
elif telemetry_record:
|
204
203
|
request.telemetry_record.CopyFrom(telemetry_record)
|
205
|
-
elif job_info:
|
206
|
-
request.job_info.CopyFrom(job_info)
|
207
204
|
elif get_system_metrics:
|
208
205
|
request.get_system_metrics.CopyFrom(get_system_metrics)
|
209
206
|
elif sync:
|
@@ -242,6 +239,9 @@ class InterfaceShared(InterfaceBase):
|
|
242
239
|
use_artifact: Optional[pb.UseArtifactRecord] = None,
|
243
240
|
output: Optional[pb.OutputRecord] = None,
|
244
241
|
output_raw: Optional[pb.OutputRawRecord] = None,
|
242
|
+
launch_wandb_config_parameters: Optional[
|
243
|
+
pb.LaunchWandbConfigParametersRecord
|
244
|
+
] = None,
|
245
245
|
) -> pb.Record:
|
246
246
|
record = pb.Record()
|
247
247
|
if run:
|
@@ -286,6 +286,8 @@ class InterfaceShared(InterfaceBase):
|
|
286
286
|
record.output.CopyFrom(output)
|
287
287
|
elif output_raw:
|
288
288
|
record.output_raw.CopyFrom(output_raw)
|
289
|
+
elif launch_wandb_config_parameters:
|
290
|
+
record.wandb_config_parameters.CopyFrom(launch_wandb_config_parameters)
|
289
291
|
else:
|
290
292
|
raise Exception("Invalid record")
|
291
293
|
return record
|
@@ -415,6 +417,14 @@ class InterfaceShared(InterfaceBase):
|
|
415
417
|
rec = self._make_record(alert=proto_alert)
|
416
418
|
self._publish(rec)
|
417
419
|
|
420
|
+
def _publish_launch_wandb_config_parameters(
|
421
|
+
self, launch_wandb_config_parameters: pb.LaunchWandbConfigParametersRecord
|
422
|
+
) -> None:
|
423
|
+
rec = self._make_record(
|
424
|
+
launch_wandb_config_parameters=launch_wandb_config_parameters
|
425
|
+
)
|
426
|
+
self._publish(rec)
|
427
|
+
|
418
428
|
def _communicate_status(
|
419
429
|
self, status: pb.StatusRequest
|
420
430
|
) -> Optional[pb.StatusResponse]:
|
@@ -523,10 +533,6 @@ class InterfaceShared(InterfaceBase):
|
|
523
533
|
record = self._make_request(run_status=run_status)
|
524
534
|
return self._deliver_record(record)
|
525
535
|
|
526
|
-
def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
|
527
|
-
record = self._make_request(job_info=job_info)
|
528
|
-
return self._deliver_record(record)
|
529
|
-
|
530
536
|
def _transport_keepalive_failed(self, keepalive_interval: int = 5) -> bool:
|
531
537
|
if self._transport_failed:
|
532
538
|
return True
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import base64
|
2
2
|
import functools
|
3
3
|
import itertools
|
4
|
+
import json
|
4
5
|
import logging
|
5
6
|
import os
|
6
7
|
import queue
|
@@ -58,6 +59,7 @@ class Chunk(NamedTuple):
|
|
58
59
|
class DefaultFilePolicy:
|
59
60
|
def __init__(self, start_chunk_id: int = 0) -> None:
|
60
61
|
self._chunk_id = start_chunk_id
|
62
|
+
self.has_debug_log = False
|
61
63
|
|
62
64
|
def process_chunks(
|
63
65
|
self, chunks: List[Chunk]
|
@@ -66,6 +68,21 @@ class DefaultFilePolicy:
|
|
66
68
|
self._chunk_id += len(chunks)
|
67
69
|
return {"offset": chunk_id, "content": [c.data for c in chunks]}
|
68
70
|
|
71
|
+
# TODO: this is very inefficient, this is meant for temporary debugging and will be removed in future releases
|
72
|
+
def _debug_log(self, data: Any):
|
73
|
+
if self.has_debug_log or not os.environ.get("WANDB_DEBUG_FILESTREAM_LOG"):
|
74
|
+
return
|
75
|
+
|
76
|
+
loaded = json.loads(data)
|
77
|
+
if not isinstance(loaded, dict):
|
78
|
+
return
|
79
|
+
|
80
|
+
# get key size and convert to MB
|
81
|
+
key_sizes = [(k, len(json.dumps(v))) for k, v in loaded.items()]
|
82
|
+
key_msg = [f"{k}: {v/1048576:.5f} MB" for k, v in key_sizes]
|
83
|
+
wandb.termerror(f"Step: {loaded['_step']} | {key_msg}", repeat=False)
|
84
|
+
self.has_debug_log = True
|
85
|
+
|
69
86
|
|
70
87
|
class JsonlFilePolicy(DefaultFilePolicy):
|
71
88
|
def process_chunks(self, chunks: List[Chunk]) -> "ProcessedChunk":
|
@@ -81,6 +98,7 @@ class JsonlFilePolicy(DefaultFilePolicy):
|
|
81
98
|
)
|
82
99
|
wandb.termerror(msg, repeat=False)
|
83
100
|
wandb._sentry.message(msg, repeat=False)
|
101
|
+
self._debug_log(chunk.data)
|
84
102
|
else:
|
85
103
|
chunk_data.append(chunk.data)
|
86
104
|
|
@@ -99,6 +117,7 @@ class SummaryFilePolicy(DefaultFilePolicy):
|
|
99
117
|
)
|
100
118
|
wandb.termerror(msg, repeat=False)
|
101
119
|
wandb._sentry.message(msg, repeat=False)
|
120
|
+
self._debug_log(data)
|
102
121
|
return False
|
103
122
|
return {"offset": 0, "content": [data]}
|
104
123
|
|
wandb/sdk/internal/handler.py
CHANGED
@@ -689,7 +689,7 @@ class HandleManager:
|
|
689
689
|
self._settings, interface=self._interface, run_proto=run_start.run
|
690
690
|
)
|
691
691
|
|
692
|
-
if run_start.run.resumed:
|
692
|
+
if run_start.run.resumed or run_start.run.forked:
|
693
693
|
self._step = run_start.run.starting_step
|
694
694
|
result = proto_util._result_from_record(record)
|
695
695
|
self._respond_result(result)
|
@@ -862,9 +862,6 @@ class HandleManager:
|
|
862
862
|
self._respond_result(result)
|
863
863
|
self._stopped.set()
|
864
864
|
|
865
|
-
def handle_request_job_info(self, record: Record) -> None:
|
866
|
-
self._dispatch_record(record, always_send=True)
|
867
|
-
|
868
865
|
def finish(self) -> None:
|
869
866
|
logger.info("shutting down handler")
|
870
867
|
if self._system_monitor is not None:
|
@@ -2150,6 +2150,7 @@ class Api:
|
|
2150
2150
|
name
|
2151
2151
|
}
|
2152
2152
|
}
|
2153
|
+
historyLineCount
|
2153
2154
|
}
|
2154
2155
|
inserted
|
2155
2156
|
_Server_Settings_
|
@@ -2237,6 +2238,7 @@ class Api:
|
|
2237
2238
|
.get("serverSettings", {})
|
2238
2239
|
.get("serverMessages", [])
|
2239
2240
|
)
|
2241
|
+
|
2240
2242
|
return (
|
2241
2243
|
response["upsertBucket"]["bucket"],
|
2242
2244
|
response["upsertBucket"]["inserted"],
|
@@ -4,7 +4,7 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import re
|
6
6
|
import sys
|
7
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
7
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
import wandb
|
10
10
|
from wandb.sdk.artifacts.artifact import Artifact
|
@@ -28,6 +28,8 @@ FROZEN_REQUIREMENTS_FNAME = "requirements.frozen.txt"
|
|
28
28
|
JOB_FNAME = "wandb-job.json"
|
29
29
|
JOB_ARTIFACT_TYPE = "job"
|
30
30
|
|
31
|
+
LOG_LEVEL = Literal["log", "warn", "error"]
|
32
|
+
|
31
33
|
|
32
34
|
class GitInfo(TypedDict):
|
33
35
|
remote: str
|
@@ -89,8 +91,9 @@ class JobBuilder:
|
|
89
91
|
_job_seq_id: Optional[str]
|
90
92
|
_job_version_alias: Optional[str]
|
91
93
|
_is_notebook_run: bool
|
94
|
+
_verbose: bool
|
92
95
|
|
93
|
-
def __init__(self, settings: SettingsStatic):
|
96
|
+
def __init__(self, settings: SettingsStatic, verbose: bool = False):
|
94
97
|
self._settings = settings
|
95
98
|
self._metadatafile_path = None
|
96
99
|
self._requirements_path = None
|
@@ -106,6 +109,7 @@ class JobBuilder:
|
|
106
109
|
Literal["repo", "artifact", "image"]
|
107
110
|
] = settings.job_source # type: ignore[assignment]
|
108
111
|
self._is_notebook_run = self._get_is_notebook_run()
|
112
|
+
self._verbose = verbose
|
109
113
|
|
110
114
|
def set_config(self, config: Dict[str, Any]) -> None:
|
111
115
|
self._config = config
|
@@ -197,6 +201,21 @@ class JobBuilder:
|
|
197
201
|
|
198
202
|
return source, name
|
199
203
|
|
204
|
+
def _log_if_verbose(self, message: str, level: LOG_LEVEL) -> None:
|
205
|
+
log_func: Optional[Union[Callable[[Any], None], Callable[[Any], None]]] = None
|
206
|
+
if level == "log":
|
207
|
+
_logger.info(message)
|
208
|
+
log_func = wandb.termlog
|
209
|
+
elif level == "warn":
|
210
|
+
_logger.warning(message)
|
211
|
+
log_func = wandb.termwarn
|
212
|
+
elif level == "error":
|
213
|
+
_logger.error(message)
|
214
|
+
log_func = wandb.termerror
|
215
|
+
|
216
|
+
if self._verbose and log_func is not None:
|
217
|
+
log_func(message)
|
218
|
+
|
200
219
|
def _build_artifact_job_source(
|
201
220
|
self,
|
202
221
|
program_relpath: str,
|
@@ -212,8 +231,9 @@ class JobBuilder:
|
|
212
231
|
# at the directory the notebook is in instead of the jupyter core
|
213
232
|
if not os.path.exists(os.path.basename(program_relpath)):
|
214
233
|
_logger.info("target path does not exist, exiting")
|
215
|
-
|
216
|
-
"No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job"
|
234
|
+
self._log_if_verbose(
|
235
|
+
"No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job",
|
236
|
+
"warn",
|
217
237
|
)
|
218
238
|
return None, None
|
219
239
|
full_program_relpath = os.path.basename(program_relpath)
|
@@ -299,22 +319,25 @@ class JobBuilder:
|
|
299
319
|
if not os.path.exists(
|
300
320
|
os.path.join(self._settings.files_dir, REQUIREMENTS_FNAME)
|
301
321
|
):
|
302
|
-
|
303
|
-
"No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
322
|
+
self._log_if_verbose(
|
323
|
+
"No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
324
|
+
"warn",
|
304
325
|
)
|
305
326
|
return None
|
306
327
|
metadata = self._handle_metadata_file()
|
307
328
|
if metadata is None:
|
308
|
-
|
309
|
-
f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables"
|
329
|
+
self._log_if_verbose(
|
330
|
+
f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables",
|
331
|
+
"warn",
|
310
332
|
)
|
311
333
|
return None
|
312
334
|
|
313
335
|
runtime: Optional[str] = metadata.get("python")
|
314
336
|
# can't build a job without a python version
|
315
337
|
if runtime is None:
|
316
|
-
|
317
|
-
"No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
338
|
+
self._log_if_verbose(
|
339
|
+
"No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
340
|
+
"warn",
|
318
341
|
)
|
319
342
|
return None
|
320
343
|
|
@@ -345,13 +368,16 @@ class JobBuilder:
|
|
345
368
|
or self._settings.job_source
|
346
369
|
or self._source_type
|
347
370
|
):
|
348
|
-
|
371
|
+
self._log_if_verbose(
|
372
|
+
"No source type found, not creating job artifact", "warn"
|
373
|
+
)
|
349
374
|
return None
|
350
375
|
|
351
376
|
program_relpath = self._get_program_relpath(source_type, metadata)
|
352
377
|
if source_type != "image" and not program_relpath:
|
353
|
-
|
354
|
-
"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
378
|
+
self._log_if_verbose(
|
379
|
+
"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
380
|
+
"warn",
|
355
381
|
)
|
356
382
|
return None
|
357
383
|
|
@@ -377,10 +403,11 @@ class JobBuilder:
|
|
377
403
|
|
378
404
|
if source is None:
|
379
405
|
if source_type:
|
380
|
-
|
406
|
+
self._log_if_verbose(
|
381
407
|
f"Source type is set to '{source_type}' but some required information is missing "
|
382
408
|
"from the environment. A job will not be created from this run. See "
|
383
|
-
"https://docs.wandb.ai/guides/launch/create-job"
|
409
|
+
"https://docs.wandb.ai/guides/launch/create-job",
|
410
|
+
"warn",
|
384
411
|
)
|
385
412
|
return None
|
386
413
|
|
@@ -447,8 +474,9 @@ class JobBuilder:
|
|
447
474
|
program = metadata.get("program")
|
448
475
|
|
449
476
|
if not program:
|
450
|
-
|
451
|
-
"Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job"
|
477
|
+
self._log_if_verbose(
|
478
|
+
"Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job",
|
479
|
+
"warn",
|
452
480
|
)
|
453
481
|
|
454
482
|
return program
|
wandb/sdk/internal/sender.py
CHANGED
@@ -115,6 +115,7 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
|
|
115
115
|
"ref": content.ref if content.ref else None,
|
116
116
|
"size": content.size if content.size is not None else None,
|
117
117
|
"local_path": content.local_path if content.local_path else None,
|
118
|
+
"skip_cache": content.skip_cache,
|
118
119
|
"extra": {
|
119
120
|
extra.key: json.loads(extra.value_json) for extra in content.extra
|
120
121
|
},
|
@@ -733,18 +734,7 @@ class SendManager:
|
|
733
734
|
)
|
734
735
|
self._respond_result(result)
|
735
736
|
|
736
|
-
def
|
737
|
-
"""Respond to a request for a job link."""
|
738
|
-
result = proto_util._result_from_record(record)
|
739
|
-
result.response.job_info_response.sequenceId = (
|
740
|
-
self._job_builder._job_seq_id or ""
|
741
|
-
)
|
742
|
-
result.response.job_info_response.version = (
|
743
|
-
self._job_builder._job_version_alias or ""
|
744
|
-
)
|
745
|
-
self._respond_result(result)
|
746
|
-
|
747
|
-
def _maybe_setup_resume(
|
737
|
+
def _setup_resume(
|
748
738
|
self, run: "RunRecord"
|
749
739
|
) -> Optional["wandb_internal_pb2.ErrorInfo"]:
|
750
740
|
"""Queries the backend for a run; fail if the settings are incompatible."""
|
@@ -890,6 +880,30 @@ class SendManager:
|
|
890
880
|
pass
|
891
881
|
# TODO: do something if sync spell is not successful?
|
892
882
|
|
883
|
+
def _setup_fork(self, server_run: dict):
|
884
|
+
assert self._settings.fork_from
|
885
|
+
assert self._settings.fork_from.metric == "_step"
|
886
|
+
assert self._run
|
887
|
+
first_step = int(self._settings.fork_from.value) + 1
|
888
|
+
self._resume_state.step = first_step
|
889
|
+
self._resume_state.history = server_run.get("historyLineCount", 0)
|
890
|
+
self._run.forked = True
|
891
|
+
self._run.starting_step = first_step
|
892
|
+
|
893
|
+
def _handle_error(
|
894
|
+
self,
|
895
|
+
record: "Record",
|
896
|
+
error: "wandb_internal_pb2.ErrorInfo",
|
897
|
+
run: "RunRecord",
|
898
|
+
) -> None:
|
899
|
+
if record.control.req_resp or record.control.mailbox_slot:
|
900
|
+
result = proto_util._result_from_record(record)
|
901
|
+
result.run_result.run.CopyFrom(run)
|
902
|
+
result.run_result.error.CopyFrom(error)
|
903
|
+
self._respond_result(result)
|
904
|
+
else:
|
905
|
+
logger.error("Got error in async mode: %s", error.message)
|
906
|
+
|
893
907
|
def send_run(self, record: "Record", file_dir: Optional[str] = None) -> None:
|
894
908
|
run = record.run
|
895
909
|
error = None
|
@@ -911,21 +925,28 @@ class SendManager:
|
|
911
925
|
config_value_dict = self._config_backend_dict()
|
912
926
|
self._config_save(config_value_dict)
|
913
927
|
|
928
|
+
do_fork = self._settings.fork_from is not None and is_wandb_init
|
929
|
+
do_resume = bool(self._settings.resume)
|
930
|
+
|
931
|
+
if do_fork and do_resume:
|
932
|
+
error = wandb_internal_pb2.ErrorInfo()
|
933
|
+
error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
|
934
|
+
error.message = (
|
935
|
+
"You cannot use `resume` and `fork_from` together. Please choose one."
|
936
|
+
)
|
937
|
+
self._handle_error(record, error, run)
|
938
|
+
|
914
939
|
if is_wandb_init:
|
915
940
|
# Ensure we have a project to query for status
|
916
941
|
if run.project == "":
|
917
942
|
run.project = util.auto_project_name(self._settings.program)
|
918
943
|
# Only check resume status on `wandb.init`
|
919
|
-
|
944
|
+
|
945
|
+
if do_resume:
|
946
|
+
error = self._setup_resume(run)
|
920
947
|
|
921
948
|
if error is not None:
|
922
|
-
|
923
|
-
result = proto_util._result_from_record(record)
|
924
|
-
result.run_result.run.CopyFrom(run)
|
925
|
-
result.run_result.error.CopyFrom(error)
|
926
|
-
self._respond_result(result)
|
927
|
-
else:
|
928
|
-
logger.error("Got error in async mode: %s", error.message)
|
949
|
+
self._handle_error(record, error, run)
|
929
950
|
return
|
930
951
|
|
931
952
|
# Save the resumed config
|
@@ -945,19 +966,22 @@ class SendManager:
|
|
945
966
|
self._config_save(config_value_dict)
|
946
967
|
|
947
968
|
try:
|
948
|
-
self._init_run(run, config_value_dict)
|
969
|
+
server_run = self._init_run(run, config_value_dict)
|
949
970
|
except (CommError, UsageError) as e:
|
950
971
|
logger.error(e, exc_info=True)
|
951
|
-
|
952
|
-
|
953
|
-
result.run_result.run.CopyFrom(run)
|
954
|
-
error = ProtobufErrorHandler.from_exception(e)
|
955
|
-
result.run_result.error.CopyFrom(error)
|
956
|
-
self._respond_result(result)
|
972
|
+
error = ProtobufErrorHandler.from_exception(e)
|
973
|
+
self._handle_error(record, error, run)
|
957
974
|
return
|
958
975
|
|
959
976
|
assert self._run # self._run is configured in _init_run()
|
960
977
|
|
978
|
+
if do_fork:
|
979
|
+
error = self._setup_fork(server_run)
|
980
|
+
|
981
|
+
if error is not None:
|
982
|
+
self._handle_error(record, error, run)
|
983
|
+
return
|
984
|
+
|
961
985
|
if record.control.req_resp or record.control.mailbox_slot:
|
962
986
|
result = proto_util._result_from_record(record)
|
963
987
|
# TODO: we could do self._interface.publish_defer(resp) to notify
|
@@ -976,7 +1000,7 @@ class SendManager:
|
|
976
1000
|
self,
|
977
1001
|
run: "RunRecord",
|
978
1002
|
config_dict: Optional[sender_config.BackendConfigDict],
|
979
|
-
) ->
|
1003
|
+
) -> dict:
|
980
1004
|
# We subtract the previous runs runtime when resuming
|
981
1005
|
start_time = (
|
982
1006
|
run.start_time.ToMicroseconds() / 1e6
|
@@ -1061,6 +1085,7 @@ class SendManager:
|
|
1061
1085
|
self._run.sweep_id = sweep_id
|
1062
1086
|
if os.getenv("SPELL_RUN_URL"):
|
1063
1087
|
self._sync_spell()
|
1088
|
+
return server_run
|
1064
1089
|
|
1065
1090
|
def _start_run_threads(self, file_dir: Optional[str] = None) -> None:
|
1066
1091
|
assert self._run # self._run is configured by caller
|
@@ -2,6 +2,7 @@ from dataclasses import fields
|
|
2
2
|
from typing import Any, Iterable, Sequence, Tuple
|
3
3
|
|
4
4
|
from wandb.proto import wandb_settings_pb2
|
5
|
+
from wandb.sdk.lib import RunMoment
|
5
6
|
from wandb.sdk.wandb_settings import SettingsData
|
6
7
|
|
7
8
|
|
@@ -38,6 +39,14 @@ class SettingsStatic(SettingsData):
|
|
38
39
|
unpacked_inner[inner_key] = inner_value
|
39
40
|
unpacked_mapping[outer_key] = unpacked_inner
|
40
41
|
value = unpacked_mapping
|
42
|
+
elif key == "fork_from":
|
43
|
+
value = getattr(proto, key)
|
44
|
+
if value.run:
|
45
|
+
value = RunMoment(
|
46
|
+
run=value.run, value=value.value, metric=value.metric
|
47
|
+
)
|
48
|
+
else:
|
49
|
+
value = None
|
41
50
|
else:
|
42
51
|
if proto.HasField(key): # type: ignore [arg-type]
|
43
52
|
value = getattr(proto, key).value
|
@@ -212,7 +212,10 @@ class SystemInfo:
|
|
212
212
|
os.path.join(self.settings.files_dir, CONDA_ENVIRONMENTS_FNAME), "w"
|
213
213
|
) as f:
|
214
214
|
subprocess.call(
|
215
|
-
["conda", "env", "export"],
|
215
|
+
["conda", "env", "export"],
|
216
|
+
stdout=f,
|
217
|
+
stderr=subprocess.DEVNULL,
|
218
|
+
timeout=15, # add timeout since conda env export could take a really long time
|
216
219
|
)
|
217
220
|
except Exception as e:
|
218
221
|
logger.exception(f"Error saving conda packages: {e}")
|
wandb/sdk/launch/_launch.py
CHANGED
@@ -62,6 +62,7 @@ def resolve_agent_config( # noqa: C901
|
|
62
62
|
max_jobs: Optional[int],
|
63
63
|
queues: Optional[Tuple[str]],
|
64
64
|
config: Optional[str],
|
65
|
+
verbosity: Optional[int],
|
65
66
|
) -> Tuple[Dict[str, Any], Api]:
|
66
67
|
"""Resolve the agent config.
|
67
68
|
|
@@ -72,6 +73,7 @@ def resolve_agent_config( # noqa: C901
|
|
72
73
|
max_jobs (int): The max number of jobs.
|
73
74
|
queues (Tuple[str]): The queues.
|
74
75
|
config (str): The config.
|
76
|
+
verbosity (int): How verbose to print, 0 or None = default, 1 = print status every 20 seconds, 2 = also print debugging information
|
75
77
|
|
76
78
|
Returns:
|
77
79
|
Tuple[Dict[str, Any], Api]: The resolved config and api.
|
@@ -83,6 +85,7 @@ def resolve_agent_config( # noqa: C901
|
|
83
85
|
"queues": [],
|
84
86
|
"registry": {},
|
85
87
|
"builder": {},
|
88
|
+
"verbosity": 0,
|
86
89
|
}
|
87
90
|
user_set_project = False
|
88
91
|
resolved_config: Dict[str, Any] = defaults
|
@@ -123,6 +126,8 @@ def resolve_agent_config( # noqa: C901
|
|
123
126
|
resolved_config.update({"max_jobs": int(max_jobs)})
|
124
127
|
if queues:
|
125
128
|
resolved_config.update({"queues": list(queues)})
|
129
|
+
if verbosity:
|
130
|
+
resolved_config.update({"verbosity": int(verbosity)})
|
126
131
|
# queue -> queues
|
127
132
|
if resolved_config.get("queue"):
|
128
133
|
if isinstance(resolved_config.get("queue"), str):
|
@@ -14,6 +14,7 @@ import wandb.docker as docker
|
|
14
14
|
from wandb.apis.internal import Api
|
15
15
|
from wandb.errors import CommError
|
16
16
|
from wandb.sdk.launch import utils
|
17
|
+
from wandb.sdk.launch.utils import get_entrypoint_file
|
17
18
|
from wandb.sdk.lib.runid import generate_id
|
18
19
|
|
19
20
|
from .errors import LaunchError
|
@@ -135,7 +136,7 @@ class LaunchProject:
|
|
135
136
|
if override_entrypoint:
|
136
137
|
_logger.info("Adding override entry point")
|
137
138
|
self.override_entrypoint = EntryPoint(
|
138
|
-
name=
|
139
|
+
name=get_entrypoint_file(override_entrypoint),
|
139
140
|
command=override_entrypoint,
|
140
141
|
)
|
141
142
|
|
@@ -536,24 +537,6 @@ class LaunchProject:
|
|
536
537
|
self.git_version = branch_name
|
537
538
|
|
538
539
|
|
539
|
-
def _get_entrypoint_file(entrypoint: List[str]) -> Optional[str]:
|
540
|
-
"""Get the entrypoint file from the given command.
|
541
|
-
|
542
|
-
Args:
|
543
|
-
entrypoint (List[str]): List of command and arguments.
|
544
|
-
|
545
|
-
Returns:
|
546
|
-
Optional[str]: The entrypoint file if found, otherwise None.
|
547
|
-
"""
|
548
|
-
if not entrypoint:
|
549
|
-
return None
|
550
|
-
if entrypoint[0].endswith(".py") or entrypoint[0].endswith(".sh"):
|
551
|
-
return entrypoint[0]
|
552
|
-
if len(entrypoint) < 2:
|
553
|
-
return None
|
554
|
-
return entrypoint[1]
|
555
|
-
|
556
|
-
|
557
540
|
class EntryPoint:
|
558
541
|
"""An entry point into a wandb launch specification."""
|
559
542
|
|
@@ -570,7 +553,9 @@ class EntryPoint:
|
|
570
553
|
|
571
554
|
def update_entrypoint_path(self, new_path: str) -> None:
|
572
555
|
"""Updates the entrypoint path to a new path."""
|
573
|
-
if len(self.command) == 2 and
|
556
|
+
if len(self.command) == 2 and (
|
557
|
+
self.command[0].startswith("python") or self.command[0] == "bash"
|
558
|
+
):
|
574
559
|
self.command[1] = new_path
|
575
560
|
|
576
561
|
|