wandb 0.16.4__py3-none-any.whl → 0.16.6__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/public/api.py +6 -6
- wandb/apis/reports/v2/interface.py +4 -8
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +29 -5
- wandb/integration/openai/fine_tuning.py +74 -37
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +92 -26
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
- wandb/sdk/artifacts/artifact_saver.py +16 -36
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +13 -5
- wandb/sdk/interface/interface.py +60 -15
- wandb/sdk/interface/interface_shared.py +13 -7
- wandb/sdk/internal/file_stream.py +19 -0
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +2 -0
- wandb/sdk/internal/job_builder.py +45 -17
- wandb/sdk/internal/sender.py +53 -28
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +4 -1
- wandb/sdk/launch/_launch.py +5 -0
- wandb/sdk/launch/_project_spec.py +5 -20
- wandb/sdk/launch/agent/agent.py +80 -37
- wandb/sdk/launch/agent/config.py +8 -0
- wandb/sdk/launch/builder/kaniko_builder.py +149 -134
- wandb/sdk/launch/create_job.py +44 -48
- wandb/sdk/launch/runner/kubernetes_monitor.py +3 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/sweeps/scheduler.py +3 -1
- wandb/sdk/launch/utils.py +23 -5
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +2 -0
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +78 -0
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/wandb_init.py +12 -7
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +179 -94
- wandb/sdk/wandb_settings.py +55 -16
- wandb/testing/relay.py +5 -6
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/METADATA +1 -1
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/RECORD +55 -54
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/WHEEL +1 -1
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/LICENSE +0 -0
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/top_level.txt +0 -0
wandb/sdk/interface/interface.py
CHANGED
@@ -13,8 +13,19 @@ import os
|
|
13
13
|
import sys
|
14
14
|
import time
|
15
15
|
from abc import abstractmethod
|
16
|
-
from typing import
|
16
|
+
from typing import (
|
17
|
+
TYPE_CHECKING,
|
18
|
+
Any,
|
19
|
+
Dict,
|
20
|
+
Iterable,
|
21
|
+
List,
|
22
|
+
NewType,
|
23
|
+
Optional,
|
24
|
+
Tuple,
|
25
|
+
Union,
|
26
|
+
)
|
17
27
|
|
28
|
+
from wandb import termwarn
|
18
29
|
from wandb.proto import wandb_internal_pb2 as pb
|
19
30
|
from wandb.proto import wandb_telemetry_pb2 as tpb
|
20
31
|
from wandb.sdk.artifacts.artifact import Artifact
|
@@ -340,6 +351,7 @@ class InterfaceBase:
|
|
340
351
|
proto_entry.ref = entry.ref
|
341
352
|
if entry.local_path:
|
342
353
|
proto_entry.local_path = entry.local_path
|
354
|
+
proto_entry.skip_cache = entry.skip_cache
|
343
355
|
for k, v in entry.extra.items():
|
344
356
|
proto_extra = proto_entry.extra.add()
|
345
357
|
proto_extra.key = k
|
@@ -436,16 +448,27 @@ class InterfaceBase:
|
|
436
448
|
path = artifact.get_entry("wandb-job.json").download()
|
437
449
|
with open(path) as f:
|
438
450
|
job_info = json.load(f)
|
451
|
+
|
439
452
|
except Exception as e:
|
440
453
|
logger.warning(
|
441
454
|
f"Failed to download partial job info from artifact {artifact}, : {e}"
|
442
455
|
)
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
456
|
+
termwarn(
|
457
|
+
f"Failed to download partial job info from artifact {artifact}, : {e}"
|
458
|
+
)
|
459
|
+
return
|
460
|
+
|
461
|
+
try:
|
462
|
+
use_artifact = self._make_proto_use_artifact(
|
463
|
+
use_artifact=use_artifact,
|
464
|
+
job_name=artifact.name,
|
465
|
+
job_info=job_info,
|
466
|
+
metadata=artifact.metadata,
|
467
|
+
)
|
468
|
+
except Exception as e:
|
469
|
+
logger.warning(f"Failed to construct use artifact proto: {e}")
|
470
|
+
termwarn(f"Failed to construct use artifact proto: {e}")
|
471
|
+
return
|
449
472
|
|
450
473
|
self._publish_use_artifact(use_artifact)
|
451
474
|
|
@@ -756,6 +779,36 @@ class InterfaceBase:
|
|
756
779
|
run_start.run.CopyFrom(run_pb)
|
757
780
|
return self._deliver_run_start(run_start)
|
758
781
|
|
782
|
+
def publish_launch_wandb_config_parameters(
|
783
|
+
self, include_paths: List[List[str]], exclude_paths: List[List[str]]
|
784
|
+
):
|
785
|
+
"""Tells the internal process to treat wandb.config fields as job inputs.
|
786
|
+
|
787
|
+
The paths provided as arguments are sequences of dictionary keys that
|
788
|
+
specify a path within the wandb.config. If a path is included, the
|
789
|
+
corresponding field will be treated as a job input. If a path is
|
790
|
+
excluded, the corresponding field will not be treated as a job input.
|
791
|
+
|
792
|
+
Args:
|
793
|
+
include_paths: paths within config to include as job inputs.
|
794
|
+
exclude_paths: paths within config to exclude as job inputs.
|
795
|
+
|
796
|
+
Returns:
|
797
|
+
None
|
798
|
+
"""
|
799
|
+
config_parameters = pb.LaunchWandbConfigParametersRecord()
|
800
|
+
include_records = [pb.ConfigFilterPath(path=path) for path in include_paths]
|
801
|
+
exclude_records = [pb.ConfigFilterPath(path=path) for path in exclude_paths]
|
802
|
+
config_parameters.include_paths.extend(include_records)
|
803
|
+
config_parameters.exclude_paths.extend(exclude_records)
|
804
|
+
return self._publish_launch_wandb_config_parameters(config_parameters)
|
805
|
+
|
806
|
+
@abstractmethod
|
807
|
+
def _publish_launch_wandb_config_parameters(
|
808
|
+
self, config_parameters: pb.LaunchWandbConfigParametersRecord
|
809
|
+
) -> None:
|
810
|
+
raise NotImplementedError
|
811
|
+
|
759
812
|
@abstractmethod
|
760
813
|
def _deliver_run_start(self, run_start: pb.RunStartRequest) -> MailboxHandle:
|
761
814
|
raise NotImplementedError
|
@@ -871,11 +924,3 @@ class InterfaceBase:
|
|
871
924
|
self, run_status: pb.RunStatusRequest
|
872
925
|
) -> MailboxHandle:
|
873
926
|
raise NotImplementedError
|
874
|
-
|
875
|
-
def deliver_request_job_info(self) -> MailboxHandle:
|
876
|
-
job_info = pb.JobInfoRequest()
|
877
|
-
return self._deliver_request_job_info(job_info)
|
878
|
-
|
879
|
-
@abstractmethod
|
880
|
-
def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
|
881
|
-
raise NotImplementedError
|
@@ -145,7 +145,6 @@ class InterfaceShared(InterfaceBase):
|
|
145
145
|
cancel: Optional[pb.CancelRequest] = None,
|
146
146
|
summary_record: Optional[pb.SummaryRecordRequest] = None,
|
147
147
|
telemetry_record: Optional[pb.TelemetryRecordRequest] = None,
|
148
|
-
job_info: Optional[pb.JobInfoRequest] = None,
|
149
148
|
get_system_metrics: Optional[pb.GetSystemMetricsRequest] = None,
|
150
149
|
python_packages: Optional[pb.PythonPackagesRequest] = None,
|
151
150
|
) -> pb.Record:
|
@@ -202,8 +201,6 @@ class InterfaceShared(InterfaceBase):
|
|
202
201
|
request.summary_record.CopyFrom(summary_record)
|
203
202
|
elif telemetry_record:
|
204
203
|
request.telemetry_record.CopyFrom(telemetry_record)
|
205
|
-
elif job_info:
|
206
|
-
request.job_info.CopyFrom(job_info)
|
207
204
|
elif get_system_metrics:
|
208
205
|
request.get_system_metrics.CopyFrom(get_system_metrics)
|
209
206
|
elif sync:
|
@@ -242,6 +239,9 @@ class InterfaceShared(InterfaceBase):
|
|
242
239
|
use_artifact: Optional[pb.UseArtifactRecord] = None,
|
243
240
|
output: Optional[pb.OutputRecord] = None,
|
244
241
|
output_raw: Optional[pb.OutputRawRecord] = None,
|
242
|
+
launch_wandb_config_parameters: Optional[
|
243
|
+
pb.LaunchWandbConfigParametersRecord
|
244
|
+
] = None,
|
245
245
|
) -> pb.Record:
|
246
246
|
record = pb.Record()
|
247
247
|
if run:
|
@@ -286,6 +286,8 @@ class InterfaceShared(InterfaceBase):
|
|
286
286
|
record.output.CopyFrom(output)
|
287
287
|
elif output_raw:
|
288
288
|
record.output_raw.CopyFrom(output_raw)
|
289
|
+
elif launch_wandb_config_parameters:
|
290
|
+
record.wandb_config_parameters.CopyFrom(launch_wandb_config_parameters)
|
289
291
|
else:
|
290
292
|
raise Exception("Invalid record")
|
291
293
|
return record
|
@@ -415,6 +417,14 @@ class InterfaceShared(InterfaceBase):
|
|
415
417
|
rec = self._make_record(alert=proto_alert)
|
416
418
|
self._publish(rec)
|
417
419
|
|
420
|
+
def _publish_launch_wandb_config_parameters(
|
421
|
+
self, launch_wandb_config_parameters: pb.LaunchWandbConfigParametersRecord
|
422
|
+
) -> None:
|
423
|
+
rec = self._make_record(
|
424
|
+
launch_wandb_config_parameters=launch_wandb_config_parameters
|
425
|
+
)
|
426
|
+
self._publish(rec)
|
427
|
+
|
418
428
|
def _communicate_status(
|
419
429
|
self, status: pb.StatusRequest
|
420
430
|
) -> Optional[pb.StatusResponse]:
|
@@ -523,10 +533,6 @@ class InterfaceShared(InterfaceBase):
|
|
523
533
|
record = self._make_request(run_status=run_status)
|
524
534
|
return self._deliver_record(record)
|
525
535
|
|
526
|
-
def _deliver_request_job_info(self, job_info: pb.JobInfoRequest) -> MailboxHandle:
|
527
|
-
record = self._make_request(job_info=job_info)
|
528
|
-
return self._deliver_record(record)
|
529
|
-
|
530
536
|
def _transport_keepalive_failed(self, keepalive_interval: int = 5) -> bool:
|
531
537
|
if self._transport_failed:
|
532
538
|
return True
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import base64
|
2
2
|
import functools
|
3
3
|
import itertools
|
4
|
+
import json
|
4
5
|
import logging
|
5
6
|
import os
|
6
7
|
import queue
|
@@ -58,6 +59,7 @@ class Chunk(NamedTuple):
|
|
58
59
|
class DefaultFilePolicy:
|
59
60
|
def __init__(self, start_chunk_id: int = 0) -> None:
|
60
61
|
self._chunk_id = start_chunk_id
|
62
|
+
self.has_debug_log = False
|
61
63
|
|
62
64
|
def process_chunks(
|
63
65
|
self, chunks: List[Chunk]
|
@@ -66,6 +68,21 @@ class DefaultFilePolicy:
|
|
66
68
|
self._chunk_id += len(chunks)
|
67
69
|
return {"offset": chunk_id, "content": [c.data for c in chunks]}
|
68
70
|
|
71
|
+
# TODO: this is very inefficient, this is meant for temporary debugging and will be removed in future releases
|
72
|
+
def _debug_log(self, data: Any):
|
73
|
+
if self.has_debug_log or not os.environ.get("WANDB_DEBUG_FILESTREAM_LOG"):
|
74
|
+
return
|
75
|
+
|
76
|
+
loaded = json.loads(data)
|
77
|
+
if not isinstance(loaded, dict):
|
78
|
+
return
|
79
|
+
|
80
|
+
# get key size and convert to MB
|
81
|
+
key_sizes = [(k, len(json.dumps(v))) for k, v in loaded.items()]
|
82
|
+
key_msg = [f"{k}: {v/1048576:.5f} MB" for k, v in key_sizes]
|
83
|
+
wandb.termerror(f"Step: {loaded['_step']} | {key_msg}", repeat=False)
|
84
|
+
self.has_debug_log = True
|
85
|
+
|
69
86
|
|
70
87
|
class JsonlFilePolicy(DefaultFilePolicy):
|
71
88
|
def process_chunks(self, chunks: List[Chunk]) -> "ProcessedChunk":
|
@@ -81,6 +98,7 @@ class JsonlFilePolicy(DefaultFilePolicy):
|
|
81
98
|
)
|
82
99
|
wandb.termerror(msg, repeat=False)
|
83
100
|
wandb._sentry.message(msg, repeat=False)
|
101
|
+
self._debug_log(chunk.data)
|
84
102
|
else:
|
85
103
|
chunk_data.append(chunk.data)
|
86
104
|
|
@@ -99,6 +117,7 @@ class SummaryFilePolicy(DefaultFilePolicy):
|
|
99
117
|
)
|
100
118
|
wandb.termerror(msg, repeat=False)
|
101
119
|
wandb._sentry.message(msg, repeat=False)
|
120
|
+
self._debug_log(data)
|
102
121
|
return False
|
103
122
|
return {"offset": 0, "content": [data]}
|
104
123
|
|
wandb/sdk/internal/handler.py
CHANGED
@@ -689,7 +689,7 @@ class HandleManager:
|
|
689
689
|
self._settings, interface=self._interface, run_proto=run_start.run
|
690
690
|
)
|
691
691
|
|
692
|
-
if run_start.run.resumed:
|
692
|
+
if run_start.run.resumed or run_start.run.forked:
|
693
693
|
self._step = run_start.run.starting_step
|
694
694
|
result = proto_util._result_from_record(record)
|
695
695
|
self._respond_result(result)
|
@@ -862,9 +862,6 @@ class HandleManager:
|
|
862
862
|
self._respond_result(result)
|
863
863
|
self._stopped.set()
|
864
864
|
|
865
|
-
def handle_request_job_info(self, record: Record) -> None:
|
866
|
-
self._dispatch_record(record, always_send=True)
|
867
|
-
|
868
865
|
def finish(self) -> None:
|
869
866
|
logger.info("shutting down handler")
|
870
867
|
if self._system_monitor is not None:
|
@@ -2150,6 +2150,7 @@ class Api:
|
|
2150
2150
|
name
|
2151
2151
|
}
|
2152
2152
|
}
|
2153
|
+
historyLineCount
|
2153
2154
|
}
|
2154
2155
|
inserted
|
2155
2156
|
_Server_Settings_
|
@@ -2237,6 +2238,7 @@ class Api:
|
|
2237
2238
|
.get("serverSettings", {})
|
2238
2239
|
.get("serverMessages", [])
|
2239
2240
|
)
|
2241
|
+
|
2240
2242
|
return (
|
2241
2243
|
response["upsertBucket"]["bucket"],
|
2242
2244
|
response["upsertBucket"]["inserted"],
|
@@ -4,7 +4,7 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import re
|
6
6
|
import sys
|
7
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
7
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
import wandb
|
10
10
|
from wandb.sdk.artifacts.artifact import Artifact
|
@@ -28,6 +28,8 @@ FROZEN_REQUIREMENTS_FNAME = "requirements.frozen.txt"
|
|
28
28
|
JOB_FNAME = "wandb-job.json"
|
29
29
|
JOB_ARTIFACT_TYPE = "job"
|
30
30
|
|
31
|
+
LOG_LEVEL = Literal["log", "warn", "error"]
|
32
|
+
|
31
33
|
|
32
34
|
class GitInfo(TypedDict):
|
33
35
|
remote: str
|
@@ -89,8 +91,9 @@ class JobBuilder:
|
|
89
91
|
_job_seq_id: Optional[str]
|
90
92
|
_job_version_alias: Optional[str]
|
91
93
|
_is_notebook_run: bool
|
94
|
+
_verbose: bool
|
92
95
|
|
93
|
-
def __init__(self, settings: SettingsStatic):
|
96
|
+
def __init__(self, settings: SettingsStatic, verbose: bool = False):
|
94
97
|
self._settings = settings
|
95
98
|
self._metadatafile_path = None
|
96
99
|
self._requirements_path = None
|
@@ -106,6 +109,7 @@ class JobBuilder:
|
|
106
109
|
Literal["repo", "artifact", "image"]
|
107
110
|
] = settings.job_source # type: ignore[assignment]
|
108
111
|
self._is_notebook_run = self._get_is_notebook_run()
|
112
|
+
self._verbose = verbose
|
109
113
|
|
110
114
|
def set_config(self, config: Dict[str, Any]) -> None:
|
111
115
|
self._config = config
|
@@ -197,6 +201,21 @@ class JobBuilder:
|
|
197
201
|
|
198
202
|
return source, name
|
199
203
|
|
204
|
+
def _log_if_verbose(self, message: str, level: LOG_LEVEL) -> None:
|
205
|
+
log_func: Optional[Union[Callable[[Any], None], Callable[[Any], None]]] = None
|
206
|
+
if level == "log":
|
207
|
+
_logger.info(message)
|
208
|
+
log_func = wandb.termlog
|
209
|
+
elif level == "warn":
|
210
|
+
_logger.warning(message)
|
211
|
+
log_func = wandb.termwarn
|
212
|
+
elif level == "error":
|
213
|
+
_logger.error(message)
|
214
|
+
log_func = wandb.termerror
|
215
|
+
|
216
|
+
if self._verbose and log_func is not None:
|
217
|
+
log_func(message)
|
218
|
+
|
200
219
|
def _build_artifact_job_source(
|
201
220
|
self,
|
202
221
|
program_relpath: str,
|
@@ -212,8 +231,9 @@ class JobBuilder:
|
|
212
231
|
# at the directory the notebook is in instead of the jupyter core
|
213
232
|
if not os.path.exists(os.path.basename(program_relpath)):
|
214
233
|
_logger.info("target path does not exist, exiting")
|
215
|
-
|
216
|
-
"No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job"
|
234
|
+
self._log_if_verbose(
|
235
|
+
"No program path found when generating artifact job source for a non-colab notebook run. See https://docs.wandb.ai/guides/launch/create-job",
|
236
|
+
"warn",
|
217
237
|
)
|
218
238
|
return None, None
|
219
239
|
full_program_relpath = os.path.basename(program_relpath)
|
@@ -299,22 +319,25 @@ class JobBuilder:
|
|
299
319
|
if not os.path.exists(
|
300
320
|
os.path.join(self._settings.files_dir, REQUIREMENTS_FNAME)
|
301
321
|
):
|
302
|
-
|
303
|
-
"No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
322
|
+
self._log_if_verbose(
|
323
|
+
"No requirements.txt found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
324
|
+
"warn",
|
304
325
|
)
|
305
326
|
return None
|
306
327
|
metadata = self._handle_metadata_file()
|
307
328
|
if metadata is None:
|
308
|
-
|
309
|
-
f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables"
|
329
|
+
self._log_if_verbose(
|
330
|
+
f"Ensure read and write access to run files dir: {self._settings.files_dir}, control this via the WANDB_DIR env var. See https://docs.wandb.ai/guides/track/environment-variables",
|
331
|
+
"warn",
|
310
332
|
)
|
311
333
|
return None
|
312
334
|
|
313
335
|
runtime: Optional[str] = metadata.get("python")
|
314
336
|
# can't build a job without a python version
|
315
337
|
if runtime is None:
|
316
|
-
|
317
|
-
"No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
338
|
+
self._log_if_verbose(
|
339
|
+
"No python version found in metadata, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
340
|
+
"warn",
|
318
341
|
)
|
319
342
|
return None
|
320
343
|
|
@@ -345,13 +368,16 @@ class JobBuilder:
|
|
345
368
|
or self._settings.job_source
|
346
369
|
or self._source_type
|
347
370
|
):
|
348
|
-
|
371
|
+
self._log_if_verbose(
|
372
|
+
"No source type found, not creating job artifact", "warn"
|
373
|
+
)
|
349
374
|
return None
|
350
375
|
|
351
376
|
program_relpath = self._get_program_relpath(source_type, metadata)
|
352
377
|
if source_type != "image" and not program_relpath:
|
353
|
-
|
354
|
-
"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"
|
378
|
+
self._log_if_verbose(
|
379
|
+
"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job",
|
380
|
+
"warn",
|
355
381
|
)
|
356
382
|
return None
|
357
383
|
|
@@ -377,10 +403,11 @@ class JobBuilder:
|
|
377
403
|
|
378
404
|
if source is None:
|
379
405
|
if source_type:
|
380
|
-
|
406
|
+
self._log_if_verbose(
|
381
407
|
f"Source type is set to '{source_type}' but some required information is missing "
|
382
408
|
"from the environment. A job will not be created from this run. See "
|
383
|
-
"https://docs.wandb.ai/guides/launch/create-job"
|
409
|
+
"https://docs.wandb.ai/guides/launch/create-job",
|
410
|
+
"warn",
|
384
411
|
)
|
385
412
|
return None
|
386
413
|
|
@@ -447,8 +474,9 @@ class JobBuilder:
|
|
447
474
|
program = metadata.get("program")
|
448
475
|
|
449
476
|
if not program:
|
450
|
-
|
451
|
-
"Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job"
|
477
|
+
self._log_if_verbose(
|
478
|
+
"Notebook 'program' path not found in metadata. See https://docs.wandb.ai/guides/launch/create-job",
|
479
|
+
"warn",
|
452
480
|
)
|
453
481
|
|
454
482
|
return program
|
wandb/sdk/internal/sender.py
CHANGED
@@ -115,6 +115,7 @@ def _manifest_json_from_proto(manifest: "ArtifactManifest") -> Dict:
|
|
115
115
|
"ref": content.ref if content.ref else None,
|
116
116
|
"size": content.size if content.size is not None else None,
|
117
117
|
"local_path": content.local_path if content.local_path else None,
|
118
|
+
"skip_cache": content.skip_cache,
|
118
119
|
"extra": {
|
119
120
|
extra.key: json.loads(extra.value_json) for extra in content.extra
|
120
121
|
},
|
@@ -733,18 +734,7 @@ class SendManager:
|
|
733
734
|
)
|
734
735
|
self._respond_result(result)
|
735
736
|
|
736
|
-
def
|
737
|
-
"""Respond to a request for a job link."""
|
738
|
-
result = proto_util._result_from_record(record)
|
739
|
-
result.response.job_info_response.sequenceId = (
|
740
|
-
self._job_builder._job_seq_id or ""
|
741
|
-
)
|
742
|
-
result.response.job_info_response.version = (
|
743
|
-
self._job_builder._job_version_alias or ""
|
744
|
-
)
|
745
|
-
self._respond_result(result)
|
746
|
-
|
747
|
-
def _maybe_setup_resume(
|
737
|
+
def _setup_resume(
|
748
738
|
self, run: "RunRecord"
|
749
739
|
) -> Optional["wandb_internal_pb2.ErrorInfo"]:
|
750
740
|
"""Queries the backend for a run; fail if the settings are incompatible."""
|
@@ -890,6 +880,30 @@ class SendManager:
|
|
890
880
|
pass
|
891
881
|
# TODO: do something if sync spell is not successful?
|
892
882
|
|
883
|
+
def _setup_fork(self, server_run: dict):
|
884
|
+
assert self._settings.fork_from
|
885
|
+
assert self._settings.fork_from.metric == "_step"
|
886
|
+
assert self._run
|
887
|
+
first_step = int(self._settings.fork_from.value) + 1
|
888
|
+
self._resume_state.step = first_step
|
889
|
+
self._resume_state.history = server_run.get("historyLineCount", 0)
|
890
|
+
self._run.forked = True
|
891
|
+
self._run.starting_step = first_step
|
892
|
+
|
893
|
+
def _handle_error(
|
894
|
+
self,
|
895
|
+
record: "Record",
|
896
|
+
error: "wandb_internal_pb2.ErrorInfo",
|
897
|
+
run: "RunRecord",
|
898
|
+
) -> None:
|
899
|
+
if record.control.req_resp or record.control.mailbox_slot:
|
900
|
+
result = proto_util._result_from_record(record)
|
901
|
+
result.run_result.run.CopyFrom(run)
|
902
|
+
result.run_result.error.CopyFrom(error)
|
903
|
+
self._respond_result(result)
|
904
|
+
else:
|
905
|
+
logger.error("Got error in async mode: %s", error.message)
|
906
|
+
|
893
907
|
def send_run(self, record: "Record", file_dir: Optional[str] = None) -> None:
|
894
908
|
run = record.run
|
895
909
|
error = None
|
@@ -911,21 +925,28 @@ class SendManager:
|
|
911
925
|
config_value_dict = self._config_backend_dict()
|
912
926
|
self._config_save(config_value_dict)
|
913
927
|
|
928
|
+
do_fork = self._settings.fork_from is not None and is_wandb_init
|
929
|
+
do_resume = bool(self._settings.resume)
|
930
|
+
|
931
|
+
if do_fork and do_resume:
|
932
|
+
error = wandb_internal_pb2.ErrorInfo()
|
933
|
+
error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.USAGE
|
934
|
+
error.message = (
|
935
|
+
"You cannot use `resume` and `fork_from` together. Please choose one."
|
936
|
+
)
|
937
|
+
self._handle_error(record, error, run)
|
938
|
+
|
914
939
|
if is_wandb_init:
|
915
940
|
# Ensure we have a project to query for status
|
916
941
|
if run.project == "":
|
917
942
|
run.project = util.auto_project_name(self._settings.program)
|
918
943
|
# Only check resume status on `wandb.init`
|
919
|
-
|
944
|
+
|
945
|
+
if do_resume:
|
946
|
+
error = self._setup_resume(run)
|
920
947
|
|
921
948
|
if error is not None:
|
922
|
-
|
923
|
-
result = proto_util._result_from_record(record)
|
924
|
-
result.run_result.run.CopyFrom(run)
|
925
|
-
result.run_result.error.CopyFrom(error)
|
926
|
-
self._respond_result(result)
|
927
|
-
else:
|
928
|
-
logger.error("Got error in async mode: %s", error.message)
|
949
|
+
self._handle_error(record, error, run)
|
929
950
|
return
|
930
951
|
|
931
952
|
# Save the resumed config
|
@@ -945,19 +966,22 @@ class SendManager:
|
|
945
966
|
self._config_save(config_value_dict)
|
946
967
|
|
947
968
|
try:
|
948
|
-
self._init_run(run, config_value_dict)
|
969
|
+
server_run = self._init_run(run, config_value_dict)
|
949
970
|
except (CommError, UsageError) as e:
|
950
971
|
logger.error(e, exc_info=True)
|
951
|
-
|
952
|
-
|
953
|
-
result.run_result.run.CopyFrom(run)
|
954
|
-
error = ProtobufErrorHandler.from_exception(e)
|
955
|
-
result.run_result.error.CopyFrom(error)
|
956
|
-
self._respond_result(result)
|
972
|
+
error = ProtobufErrorHandler.from_exception(e)
|
973
|
+
self._handle_error(record, error, run)
|
957
974
|
return
|
958
975
|
|
959
976
|
assert self._run # self._run is configured in _init_run()
|
960
977
|
|
978
|
+
if do_fork:
|
979
|
+
error = self._setup_fork(server_run)
|
980
|
+
|
981
|
+
if error is not None:
|
982
|
+
self._handle_error(record, error, run)
|
983
|
+
return
|
984
|
+
|
961
985
|
if record.control.req_resp or record.control.mailbox_slot:
|
962
986
|
result = proto_util._result_from_record(record)
|
963
987
|
# TODO: we could do self._interface.publish_defer(resp) to notify
|
@@ -976,7 +1000,7 @@ class SendManager:
|
|
976
1000
|
self,
|
977
1001
|
run: "RunRecord",
|
978
1002
|
config_dict: Optional[sender_config.BackendConfigDict],
|
979
|
-
) ->
|
1003
|
+
) -> dict:
|
980
1004
|
# We subtract the previous runs runtime when resuming
|
981
1005
|
start_time = (
|
982
1006
|
run.start_time.ToMicroseconds() / 1e6
|
@@ -1061,6 +1085,7 @@ class SendManager:
|
|
1061
1085
|
self._run.sweep_id = sweep_id
|
1062
1086
|
if os.getenv("SPELL_RUN_URL"):
|
1063
1087
|
self._sync_spell()
|
1088
|
+
return server_run
|
1064
1089
|
|
1065
1090
|
def _start_run_threads(self, file_dir: Optional[str] = None) -> None:
|
1066
1091
|
assert self._run # self._run is configured by caller
|
@@ -2,6 +2,7 @@ from dataclasses import fields
|
|
2
2
|
from typing import Any, Iterable, Sequence, Tuple
|
3
3
|
|
4
4
|
from wandb.proto import wandb_settings_pb2
|
5
|
+
from wandb.sdk.lib import RunMoment
|
5
6
|
from wandb.sdk.wandb_settings import SettingsData
|
6
7
|
|
7
8
|
|
@@ -38,6 +39,14 @@ class SettingsStatic(SettingsData):
|
|
38
39
|
unpacked_inner[inner_key] = inner_value
|
39
40
|
unpacked_mapping[outer_key] = unpacked_inner
|
40
41
|
value = unpacked_mapping
|
42
|
+
elif key == "fork_from":
|
43
|
+
value = getattr(proto, key)
|
44
|
+
if value.run:
|
45
|
+
value = RunMoment(
|
46
|
+
run=value.run, value=value.value, metric=value.metric
|
47
|
+
)
|
48
|
+
else:
|
49
|
+
value = None
|
41
50
|
else:
|
42
51
|
if proto.HasField(key): # type: ignore [arg-type]
|
43
52
|
value = getattr(proto, key).value
|
@@ -212,7 +212,10 @@ class SystemInfo:
|
|
212
212
|
os.path.join(self.settings.files_dir, CONDA_ENVIRONMENTS_FNAME), "w"
|
213
213
|
) as f:
|
214
214
|
subprocess.call(
|
215
|
-
["conda", "env", "export"],
|
215
|
+
["conda", "env", "export"],
|
216
|
+
stdout=f,
|
217
|
+
stderr=subprocess.DEVNULL,
|
218
|
+
timeout=15, # add timeout since conda env export could take a really long time
|
216
219
|
)
|
217
220
|
except Exception as e:
|
218
221
|
logger.exception(f"Error saving conda packages: {e}")
|
wandb/sdk/launch/_launch.py
CHANGED
@@ -62,6 +62,7 @@ def resolve_agent_config( # noqa: C901
|
|
62
62
|
max_jobs: Optional[int],
|
63
63
|
queues: Optional[Tuple[str]],
|
64
64
|
config: Optional[str],
|
65
|
+
verbosity: Optional[int],
|
65
66
|
) -> Tuple[Dict[str, Any], Api]:
|
66
67
|
"""Resolve the agent config.
|
67
68
|
|
@@ -72,6 +73,7 @@ def resolve_agent_config( # noqa: C901
|
|
72
73
|
max_jobs (int): The max number of jobs.
|
73
74
|
queues (Tuple[str]): The queues.
|
74
75
|
config (str): The config.
|
76
|
+
verbosity (int): How verbose to print, 0 or None = default, 1 = print status every 20 seconds, 2 = also print debugging information
|
75
77
|
|
76
78
|
Returns:
|
77
79
|
Tuple[Dict[str, Any], Api]: The resolved config and api.
|
@@ -83,6 +85,7 @@ def resolve_agent_config( # noqa: C901
|
|
83
85
|
"queues": [],
|
84
86
|
"registry": {},
|
85
87
|
"builder": {},
|
88
|
+
"verbosity": 0,
|
86
89
|
}
|
87
90
|
user_set_project = False
|
88
91
|
resolved_config: Dict[str, Any] = defaults
|
@@ -123,6 +126,8 @@ def resolve_agent_config( # noqa: C901
|
|
123
126
|
resolved_config.update({"max_jobs": int(max_jobs)})
|
124
127
|
if queues:
|
125
128
|
resolved_config.update({"queues": list(queues)})
|
129
|
+
if verbosity:
|
130
|
+
resolved_config.update({"verbosity": int(verbosity)})
|
126
131
|
# queue -> queues
|
127
132
|
if resolved_config.get("queue"):
|
128
133
|
if isinstance(resolved_config.get("queue"), str):
|
@@ -14,6 +14,7 @@ import wandb.docker as docker
|
|
14
14
|
from wandb.apis.internal import Api
|
15
15
|
from wandb.errors import CommError
|
16
16
|
from wandb.sdk.launch import utils
|
17
|
+
from wandb.sdk.launch.utils import get_entrypoint_file
|
17
18
|
from wandb.sdk.lib.runid import generate_id
|
18
19
|
|
19
20
|
from .errors import LaunchError
|
@@ -135,7 +136,7 @@ class LaunchProject:
|
|
135
136
|
if override_entrypoint:
|
136
137
|
_logger.info("Adding override entry point")
|
137
138
|
self.override_entrypoint = EntryPoint(
|
138
|
-
name=
|
139
|
+
name=get_entrypoint_file(override_entrypoint),
|
139
140
|
command=override_entrypoint,
|
140
141
|
)
|
141
142
|
|
@@ -536,24 +537,6 @@ class LaunchProject:
|
|
536
537
|
self.git_version = branch_name
|
537
538
|
|
538
539
|
|
539
|
-
def _get_entrypoint_file(entrypoint: List[str]) -> Optional[str]:
|
540
|
-
"""Get the entrypoint file from the given command.
|
541
|
-
|
542
|
-
Args:
|
543
|
-
entrypoint (List[str]): List of command and arguments.
|
544
|
-
|
545
|
-
Returns:
|
546
|
-
Optional[str]: The entrypoint file if found, otherwise None.
|
547
|
-
"""
|
548
|
-
if not entrypoint:
|
549
|
-
return None
|
550
|
-
if entrypoint[0].endswith(".py") or entrypoint[0].endswith(".sh"):
|
551
|
-
return entrypoint[0]
|
552
|
-
if len(entrypoint) < 2:
|
553
|
-
return None
|
554
|
-
return entrypoint[1]
|
555
|
-
|
556
|
-
|
557
540
|
class EntryPoint:
|
558
541
|
"""An entry point into a wandb launch specification."""
|
559
542
|
|
@@ -570,7 +553,9 @@ class EntryPoint:
|
|
570
553
|
|
571
554
|
def update_entrypoint_path(self, new_path: str) -> None:
|
572
555
|
"""Updates the entrypoint path to a new path."""
|
573
|
-
if len(self.command) == 2 and
|
556
|
+
if len(self.command) == 2 and (
|
557
|
+
self.command[0].startswith("python") or self.command[0] == "bash"
|
558
|
+
):
|
574
559
|
self.command[1] = new_path
|
575
560
|
|
576
561
|
|