wandb 0.15.9__py3-none-any.whl → 0.15.11__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +5 -1
- wandb/apis/public.py +137 -17
- wandb/apis/reports/_panels.py +1 -1
- wandb/apis/reports/blocks.py +1 -0
- wandb/apis/reports/report.py +27 -5
- wandb/cli/cli.py +52 -41
- wandb/docker/__init__.py +17 -0
- wandb/docker/auth.py +1 -1
- wandb/env.py +24 -4
- wandb/filesync/step_checksum.py +3 -3
- wandb/integration/openai/openai.py +3 -0
- wandb/integration/ultralytics/__init__.py +9 -0
- wandb/integration/ultralytics/bbox_utils.py +196 -0
- wandb/integration/ultralytics/callback.py +458 -0
- wandb/integration/ultralytics/classification_utils.py +66 -0
- wandb/integration/ultralytics/mask_utils.py +141 -0
- wandb/integration/ultralytics/pose_utils.py +92 -0
- wandb/integration/xgboost/xgboost.py +3 -3
- wandb/integration/yolov8/__init__.py +0 -7
- wandb/integration/yolov8/yolov8.py +22 -3
- wandb/old/settings.py +7 -0
- wandb/plot/line_series.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +353 -300
- wandb/proto/v3/wandb_server_pb2.py +37 -41
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
- wandb/proto/v4/wandb_internal_pb2.py +272 -260
- wandb/proto/v4/wandb_server_pb2.py +37 -40
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
- wandb/proto/wandb_internal_codegen.py +7 -31
- wandb/sdk/artifacts/artifact.py +321 -189
- wandb/sdk/artifacts/artifact_cache.py +14 -0
- wandb/sdk/artifacts/artifact_manifest.py +5 -4
- wandb/sdk/artifacts/artifact_manifest_entry.py +37 -9
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -9
- wandb/sdk/artifacts/artifact_saver.py +13 -50
- wandb/sdk/artifacts/artifact_ttl.py +6 -0
- wandb/sdk/artifacts/artifacts_cache.py +119 -93
- wandb/sdk/artifacts/staging.py +25 -0
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +2 -3
- wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
- wandb/sdk/artifacts/storage_policies/register.py +1 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +4 -3
- wandb/sdk/artifacts/storage_policy.py +4 -2
- wandb/sdk/backend/backend.py +0 -16
- wandb/sdk/data_types/image.py +3 -1
- wandb/sdk/integration_utils/auto_logging.py +38 -13
- wandb/sdk/interface/interface.py +16 -135
- wandb/sdk/interface/interface_shared.py +9 -147
- wandb/sdk/interface/interface_sock.py +0 -26
- wandb/sdk/internal/file_pusher.py +20 -3
- wandb/sdk/internal/file_stream.py +3 -1
- wandb/sdk/internal/handler.py +53 -70
- wandb/sdk/internal/internal_api.py +220 -130
- wandb/sdk/internal/job_builder.py +41 -37
- wandb/sdk/internal/sender.py +7 -25
- wandb/sdk/internal/system/assets/disk.py +144 -11
- wandb/sdk/internal/system/system_info.py +6 -2
- wandb/sdk/launch/__init__.py +5 -0
- wandb/sdk/launch/{launch.py → _launch.py} +53 -54
- wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
- wandb/sdk/launch/_project_spec.py +13 -2
- wandb/sdk/launch/agent/agent.py +103 -59
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
- wandb/sdk/launch/builder/build.py +19 -1
- wandb/sdk/launch/builder/docker_builder.py +5 -1
- wandb/sdk/launch/builder/kaniko_builder.py +5 -1
- wandb/sdk/launch/create_job.py +20 -5
- wandb/sdk/launch/loader.py +14 -5
- wandb/sdk/launch/runner/abstract.py +0 -2
- wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +66 -209
- wandb/sdk/launch/runner/local_container.py +5 -2
- wandb/sdk/launch/runner/local_process.py +4 -1
- wandb/sdk/launch/sweeps/scheduler.py +43 -25
- wandb/sdk/launch/sweeps/utils.py +5 -3
- wandb/sdk/launch/utils.py +3 -1
- wandb/sdk/lib/_settings_toposort_generate.py +3 -9
- wandb/sdk/lib/_settings_toposort_generated.py +27 -3
- wandb/sdk/lib/_wburls_generated.py +1 -0
- wandb/sdk/lib/filenames.py +27 -6
- wandb/sdk/lib/filesystem.py +181 -7
- wandb/sdk/lib/fsm.py +5 -3
- wandb/sdk/lib/gql_request.py +3 -0
- wandb/sdk/lib/ipython.py +7 -0
- wandb/sdk/lib/wburls.py +1 -0
- wandb/sdk/service/port_file.py +2 -15
- wandb/sdk/service/server.py +7 -55
- wandb/sdk/service/service.py +56 -26
- wandb/sdk/service/service_base.py +1 -1
- wandb/sdk/service/streams.py +11 -5
- wandb/sdk/verify/verify.py +2 -2
- wandb/sdk/wandb_init.py +8 -2
- wandb/sdk/wandb_manager.py +4 -14
- wandb/sdk/wandb_run.py +143 -53
- wandb/sdk/wandb_settings.py +148 -35
- wandb/testing/relay.py +85 -38
- wandb/util.py +87 -4
- wandb/wandb_torch.py +24 -38
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/METADATA +48 -23
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/RECORD +107 -103
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/WHEEL +1 -1
- wandb/proto/v3/wandb_server_pb2_grpc.py +0 -1422
- wandb/proto/v4/wandb_server_pb2_grpc.py +0 -1422
- wandb/proto/wandb_server_pb2_grpc.py +0 -8
- wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +0 -61
- wandb/sdk/interface/interface_grpc.py +0 -460
- wandb/sdk/service/server_grpc.py +0 -444
- wandb/sdk/service/service_grpc.py +0 -73
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
- {wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0
@@ -87,6 +87,7 @@ class JobBuilder:
|
|
87
87
|
_aliases: List[str]
|
88
88
|
_job_seq_id: Optional[str]
|
89
89
|
_job_version_alias: Optional[str]
|
90
|
+
_is_notebook_run: bool
|
90
91
|
|
91
92
|
def __init__(self, settings: SettingsStatic):
|
92
93
|
self._settings = settings
|
@@ -103,6 +104,7 @@ class JobBuilder:
|
|
103
104
|
self._source_type: Optional[
|
104
105
|
Literal["repo", "artifact", "image"]
|
105
106
|
] = settings.job_source # type: ignore[assignment]
|
107
|
+
self._is_notebook_run = self._get_is_notebook_run()
|
106
108
|
|
107
109
|
def set_config(self, config: Dict[str, Any]) -> None:
|
108
110
|
self._config = config
|
@@ -153,7 +155,7 @@ class JobBuilder:
|
|
153
155
|
commit = git_info.get("commit")
|
154
156
|
assert remote is not None
|
155
157
|
assert commit is not None
|
156
|
-
if self._is_notebook_run
|
158
|
+
if self._is_notebook_run:
|
157
159
|
if not os.path.exists(
|
158
160
|
os.path.join(os.getcwd(), os.path.basename(program_relpath))
|
159
161
|
):
|
@@ -194,7 +196,7 @@ class JobBuilder:
|
|
194
196
|
os.path.basename(sys.executable),
|
195
197
|
full_program_path,
|
196
198
|
],
|
197
|
-
"notebook": self._is_notebook_run
|
199
|
+
"notebook": self._is_notebook_run,
|
198
200
|
}
|
199
201
|
|
200
202
|
if self._settings.job_name:
|
@@ -219,18 +221,16 @@ class JobBuilder:
|
|
219
221
|
) -> Tuple[Optional[ArtifactSourceDict], Optional[str]]:
|
220
222
|
assert isinstance(self._logged_code_artifact, dict)
|
221
223
|
# TODO: should we just always exit early if the path doesn't exist?
|
222
|
-
if self._is_notebook_run
|
224
|
+
if self._is_notebook_run and not self._is_colab_run():
|
223
225
|
full_program_relpath = os.path.relpath(program_relpath, os.getcwd())
|
224
226
|
# if the resolved path doesn't exist, then we shouldn't make a job because it will fail
|
225
227
|
if not os.path.exists(full_program_relpath):
|
226
228
|
# when users call log code in a notebook the code artifact starts
|
227
|
-
# at the directory the notebook is in instead of the jupyter
|
228
|
-
|
229
|
-
if os.path.exists(os.path.basename(program_relpath)):
|
230
|
-
full_program_relpath = os.path.basename(program_relpath)
|
231
|
-
else:
|
229
|
+
# at the directory the notebook is in instead of the jupyter core
|
230
|
+
if not os.path.exists(os.path.basename(program_relpath)):
|
232
231
|
_logger.info("target path does not exist, exiting")
|
233
232
|
return None, None
|
233
|
+
full_program_relpath = os.path.basename(program_relpath)
|
234
234
|
else:
|
235
235
|
full_program_relpath = program_relpath
|
236
236
|
entrypoint = [
|
@@ -240,7 +240,7 @@ class JobBuilder:
|
|
240
240
|
# TODO: update executable to a method that supports pex
|
241
241
|
source: ArtifactSourceDict = {
|
242
242
|
"entrypoint": entrypoint,
|
243
|
-
"notebook": self._is_notebook_run
|
243
|
+
"notebook": self._is_notebook_run,
|
244
244
|
"artifact": f"wandb-artifact://_id/{self._logged_code_artifact['id']}",
|
245
245
|
}
|
246
246
|
|
@@ -271,7 +271,7 @@ class JobBuilder:
|
|
271
271
|
}
|
272
272
|
return source, name
|
273
273
|
|
274
|
-
def
|
274
|
+
def _get_is_notebook_run(self) -> bool:
|
275
275
|
return hasattr(self._settings, "_jupyter") and bool(self._settings._jupyter)
|
276
276
|
|
277
277
|
def _is_colab_run(self) -> bool:
|
@@ -288,15 +288,10 @@ class JobBuilder:
|
|
288
288
|
return None
|
289
289
|
|
290
290
|
runtime: Optional[str] = metadata.get("python")
|
291
|
-
program_relpath: Optional[str] = metadata.get("codePath")
|
292
291
|
# can't build a job without a python version
|
293
292
|
if runtime is None:
|
294
293
|
return None
|
295
294
|
|
296
|
-
if self._is_notebook_run():
|
297
|
-
_logger.info("run is notebook based run")
|
298
|
-
program_relpath = metadata.get("program")
|
299
|
-
|
300
295
|
input_types = TypeRegistry.type_of(self._config).to_json()
|
301
296
|
output_types = TypeRegistry.type_of(self._summary).to_json()
|
302
297
|
|
@@ -315,10 +310,14 @@ class JobBuilder:
|
|
315
310
|
source_type = source_info.get("source_type")
|
316
311
|
else:
|
317
312
|
# configure job from environment
|
318
|
-
source_type = self._get_source_type(metadata
|
313
|
+
source_type = self._get_source_type(metadata)
|
319
314
|
if not source_type:
|
320
315
|
return None
|
321
316
|
|
317
|
+
program_relpath = self._get_program_relpath(source_type, metadata)
|
318
|
+
if source_type != "image" and not program_relpath:
|
319
|
+
return None
|
320
|
+
|
322
321
|
source: Union[
|
323
322
|
Optional[GitSourceDict],
|
324
323
|
Optional[ArtifactSourceDict],
|
@@ -326,19 +325,15 @@ class JobBuilder:
|
|
326
325
|
] = None
|
327
326
|
|
328
327
|
# make source dict
|
329
|
-
if source_type == "repo"
|
330
|
-
|
331
|
-
):
|
332
|
-
assert program_relpath is not None
|
328
|
+
if source_type == "repo":
|
329
|
+
assert program_relpath
|
333
330
|
source, name = self._build_repo_job_source(
|
334
331
|
metadata,
|
335
332
|
program_relpath,
|
336
333
|
metadata.get("root"),
|
337
334
|
)
|
338
|
-
elif source_type == "artifact"
|
339
|
-
program_relpath
|
340
|
-
):
|
341
|
-
assert program_relpath is not None
|
335
|
+
elif source_type == "artifact":
|
336
|
+
assert program_relpath
|
342
337
|
source, name = self._build_artifact_job_source(program_relpath)
|
343
338
|
elif source_type == "image" and self._has_image_job_ingredients(metadata):
|
344
339
|
source, name = self._build_image_job_source(metadata)
|
@@ -390,17 +385,15 @@ class JobBuilder:
|
|
390
385
|
|
391
386
|
return artifact
|
392
387
|
|
393
|
-
def _get_source_type(
|
394
|
-
self, metadata: Dict[str, Any], relpath: Optional[str]
|
395
|
-
) -> Optional[str]:
|
388
|
+
def _get_source_type(self, metadata: Dict[str, Any]) -> Optional[str]:
|
396
389
|
if self._source_type:
|
397
390
|
return self._source_type
|
398
391
|
|
399
|
-
if self._has_git_job_ingredients(metadata
|
392
|
+
if self._has_git_job_ingredients(metadata):
|
400
393
|
_logger.info("is repo sourced job")
|
401
394
|
return "repo"
|
402
395
|
|
403
|
-
if self._has_artifact_job_ingredients(
|
396
|
+
if self._has_artifact_job_ingredients():
|
404
397
|
_logger.info("is artifact sourced job")
|
405
398
|
return "artifact"
|
406
399
|
|
@@ -411,6 +404,21 @@ class JobBuilder:
|
|
411
404
|
_logger.info("no source found")
|
412
405
|
return None
|
413
406
|
|
407
|
+
def _get_program_relpath(
|
408
|
+
self, source_type: str, metadata: Dict[str, Any]
|
409
|
+
) -> Optional[str]:
|
410
|
+
if self._is_notebook_run:
|
411
|
+
_logger.info("run is notebook based run")
|
412
|
+
return metadata.get("program")
|
413
|
+
|
414
|
+
if source_type == "artifact" or self._settings.job_source == "artifact":
|
415
|
+
# if the job is set to be an artifact, use relpath guaranteed
|
416
|
+
# to be correct. 'codePath' uses the root path when in git repo
|
417
|
+
# fallback to codePath if strictly local relpath not present
|
418
|
+
return metadata.get("codePathLocal") or metadata.get("codePath")
|
419
|
+
|
420
|
+
return metadata.get("codePath")
|
421
|
+
|
414
422
|
def _handle_metadata_file(
|
415
423
|
self,
|
416
424
|
) -> Optional[Dict]:
|
@@ -421,18 +429,14 @@ class JobBuilder:
|
|
421
429
|
|
422
430
|
return None
|
423
431
|
|
424
|
-
def _has_git_job_ingredients(
|
425
|
-
self, metadata: Dict[str, Any], program_relpath: Optional[str]
|
426
|
-
) -> bool:
|
432
|
+
def _has_git_job_ingredients(self, metadata: Dict[str, Any]) -> bool:
|
427
433
|
git_info: Dict[str, str] = metadata.get("git", {})
|
428
|
-
if
|
429
|
-
return False
|
430
|
-
if self._is_notebook_run() and metadata.get("root") is None:
|
434
|
+
if self._is_notebook_run and metadata.get("root") is None:
|
431
435
|
return False
|
432
436
|
return git_info.get("remote") is not None and git_info.get("commit") is not None
|
433
437
|
|
434
|
-
def _has_artifact_job_ingredients(self
|
435
|
-
return self._logged_code_artifact is not None
|
438
|
+
def _has_artifact_job_ingredients(self) -> bool:
|
439
|
+
return self._logged_code_artifact is not None
|
436
440
|
|
437
441
|
def _has_image_job_ingredients(self, metadata: Dict[str, Any]) -> bool:
|
438
442
|
return metadata.get("docker") is not None
|
wandb/sdk/internal/sender.py
CHANGED
@@ -33,7 +33,7 @@ from wandb.errors import CommError, UsageError
|
|
33
33
|
from wandb.errors.util import ProtobufErrorHandler
|
34
34
|
from wandb.filesync.dir_watcher import DirWatcher
|
35
35
|
from wandb.proto import wandb_internal_pb2
|
36
|
-
from wandb.sdk.artifacts import
|
36
|
+
from wandb.sdk.artifacts.artifact_saver import ArtifactSaver
|
37
37
|
from wandb.sdk.interface import interface
|
38
38
|
from wandb.sdk.interface.interface_queue import InterfaceQueue
|
39
39
|
from wandb.sdk.internal import (
|
@@ -268,6 +268,7 @@ class SendManager:
|
|
268
268
|
self._cached_summary: Dict[str, Any] = dict()
|
269
269
|
self._config_metric_index_dict: Dict[str, int] = {}
|
270
270
|
self._config_metric_dict: Dict[str, wandb_internal_pb2.MetricRecord] = {}
|
271
|
+
self._consolidated_summary: Dict[str, Any] = dict()
|
271
272
|
|
272
273
|
self._cached_server_info = dict()
|
273
274
|
self._cached_viewer = dict()
|
@@ -1149,7 +1150,9 @@ class SendManager:
|
|
1149
1150
|
summary_dict.pop("_wandb", None)
|
1150
1151
|
if self._metadata_summary:
|
1151
1152
|
summary_dict["_wandb"] = self._metadata_summary
|
1152
|
-
|
1153
|
+
# merge with consolidated summary
|
1154
|
+
self._consolidated_summary.update(summary_dict)
|
1155
|
+
json_summary = json.dumps(self._consolidated_summary)
|
1153
1156
|
if self._fs:
|
1154
1157
|
self._fs.push(filenames.SUMMARY_FNAME, json_summary)
|
1155
1158
|
# TODO(jhr): we should only write this at the end of the script
|
@@ -1446,28 +1449,6 @@ class SendManager:
|
|
1446
1449
|
|
1447
1450
|
self._respond_result(result)
|
1448
1451
|
|
1449
|
-
def send_request_artifact_send(self, record: "Record") -> None:
|
1450
|
-
# TODO: combine and eventually remove send_request_log_artifact()
|
1451
|
-
|
1452
|
-
# for now, we are using req/resp uuid for transaction id
|
1453
|
-
# in the future this should be part of the message to handle idempotency
|
1454
|
-
xid = record.uuid
|
1455
|
-
|
1456
|
-
done_msg = wandb_internal_pb2.ArtifactDoneRequest(xid=xid)
|
1457
|
-
artifact = record.request.artifact_send.artifact
|
1458
|
-
try:
|
1459
|
-
res = self._send_artifact(artifact)
|
1460
|
-
assert res, "Unable to send artifact"
|
1461
|
-
done_msg.artifact_id = res["id"]
|
1462
|
-
logger.info(f"logged artifact {artifact.name} - {res}")
|
1463
|
-
except Exception as e:
|
1464
|
-
done_msg.error_message = 'error logging artifact "{}/{}": {}'.format(
|
1465
|
-
artifact.type, artifact.name, e
|
1466
|
-
)
|
1467
|
-
|
1468
|
-
logger.info("send artifact done")
|
1469
|
-
self._interface._publish_artifact_done(done_msg)
|
1470
|
-
|
1471
1452
|
def send_artifact(self, record: "Record") -> None:
|
1472
1453
|
artifact = record.artifact
|
1473
1454
|
try:
|
@@ -1486,7 +1467,7 @@ class SendManager:
|
|
1486
1467
|
from pkg_resources import parse_version
|
1487
1468
|
|
1488
1469
|
assert self._pusher
|
1489
|
-
saver =
|
1470
|
+
saver = ArtifactSaver(
|
1490
1471
|
api=self._api,
|
1491
1472
|
digest=artifact.digest,
|
1492
1473
|
manifest_json=_manifest_json_from_proto(artifact.manifest),
|
@@ -1512,6 +1493,7 @@ class SendManager:
|
|
1512
1493
|
client_id=artifact.client_id,
|
1513
1494
|
sequence_client_id=artifact.sequence_client_id,
|
1514
1495
|
metadata=metadata,
|
1496
|
+
ttl_duration_seconds=artifact.ttl_duration_seconds or None,
|
1515
1497
|
description=artifact.description or None,
|
1516
1498
|
aliases=artifact.aliases,
|
1517
1499
|
use_after_commit=artifact.use_after_commit,
|
@@ -1,12 +1,14 @@
|
|
1
1
|
import threading
|
2
2
|
from collections import deque
|
3
|
-
from typing import TYPE_CHECKING, List
|
3
|
+
from typing import TYPE_CHECKING, List, Optional
|
4
4
|
|
5
5
|
try:
|
6
6
|
import psutil
|
7
7
|
except ImportError:
|
8
8
|
psutil = None
|
9
9
|
|
10
|
+
from wandb.errors.term import termwarn
|
11
|
+
|
10
12
|
from .aggregators import aggregate_mean
|
11
13
|
from .asset_registry import asset_registry
|
12
14
|
from .interfaces import Interface, Metric, MetricsMonitor
|
@@ -17,18 +19,132 @@ if TYPE_CHECKING:
|
|
17
19
|
from wandb.sdk.internal.settings_static import SettingsStatic
|
18
20
|
|
19
21
|
|
20
|
-
class
|
22
|
+
class DiskUsagePercent:
|
21
23
|
"""Total system disk usage in percent."""
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
+
name = "disk.{path}.usagePercent"
|
26
|
+
samples: "Deque[List[float]]"
|
27
|
+
|
28
|
+
def __init__(self, paths: List[str]) -> None:
|
29
|
+
self.samples = deque([])
|
30
|
+
# check if we have access to the disk paths:
|
31
|
+
self.paths: List[str] = []
|
32
|
+
for path in paths:
|
33
|
+
try:
|
34
|
+
psutil.disk_usage(path)
|
35
|
+
self.paths.append(path)
|
36
|
+
except Exception as e: # noqa
|
37
|
+
termwarn(f"Could not access disk path {path}: {e}", repeat=False)
|
38
|
+
|
39
|
+
def sample(self) -> None:
|
40
|
+
# self.samples.append(psutil.disk_usage("/").percent)
|
41
|
+
disk_usage: List[float] = []
|
42
|
+
for path in self.paths:
|
43
|
+
disk_usage.append(psutil.disk_usage(path).percent)
|
44
|
+
if disk_usage:
|
45
|
+
self.samples.append(disk_usage)
|
46
|
+
|
47
|
+
def clear(self) -> None:
|
48
|
+
self.samples.clear()
|
49
|
+
|
50
|
+
def aggregate(self) -> dict:
|
51
|
+
if not self.samples:
|
52
|
+
return {}
|
53
|
+
disk_metrics = {}
|
54
|
+
for i, _path in enumerate(self.paths):
|
55
|
+
aggregate_i = aggregate_mean([sample[i] for sample in self.samples])
|
56
|
+
# ugly hack to please the frontend:
|
57
|
+
_path = _path.replace("/", "\\")
|
58
|
+
disk_metrics[self.name.format(path=_path)] = aggregate_i
|
59
|
+
|
60
|
+
return disk_metrics
|
61
|
+
|
62
|
+
|
63
|
+
class DiskUsage:
|
64
|
+
"""Total system disk usage in GB."""
|
65
|
+
|
66
|
+
name = "disk.{path}.usageGB"
|
67
|
+
samples: "Deque[List[float]]"
|
68
|
+
|
69
|
+
def __init__(self, paths: List[str]) -> None:
|
70
|
+
self.samples = deque([])
|
71
|
+
# check if we have access to the disk paths:
|
72
|
+
self.paths: List[str] = []
|
73
|
+
for path in paths:
|
74
|
+
try:
|
75
|
+
psutil.disk_usage(path)
|
76
|
+
self.paths.append(path)
|
77
|
+
except Exception as e: # noqa
|
78
|
+
termwarn(f"Could not access disk path {path}: {e}", repeat=False)
|
79
|
+
|
80
|
+
def sample(self) -> None:
|
81
|
+
disk_usage: List[float] = []
|
82
|
+
for path in self.paths:
|
83
|
+
disk_usage.append(psutil.disk_usage(path).used / 1024 / 1024 / 1024)
|
84
|
+
if disk_usage:
|
85
|
+
self.samples.append(disk_usage)
|
86
|
+
|
87
|
+
def clear(self) -> None:
|
88
|
+
self.samples.clear()
|
89
|
+
|
90
|
+
def aggregate(self) -> dict:
|
91
|
+
if not self.samples:
|
92
|
+
return {}
|
93
|
+
disk_metrics = {}
|
94
|
+
for i, _path in enumerate(self.paths):
|
95
|
+
aggregate_i = aggregate_mean([sample[i] for sample in self.samples])
|
96
|
+
# ugly hack to please the frontend:
|
97
|
+
_path = _path.replace("/", "\\")
|
98
|
+
disk_metrics[self.name.format(path=_path)] = aggregate_i
|
99
|
+
|
100
|
+
return disk_metrics
|
101
|
+
|
102
|
+
|
103
|
+
class DiskIn:
|
104
|
+
"""Total system disk read in MB."""
|
105
|
+
|
106
|
+
name = "disk.in"
|
107
|
+
samples: "Deque[float]"
|
108
|
+
|
109
|
+
def __init__(self) -> None:
|
110
|
+
self.samples = deque([])
|
111
|
+
self.read_init: Optional[int] = None
|
112
|
+
|
113
|
+
def sample(self) -> None:
|
114
|
+
if self.read_init is None:
|
115
|
+
# initialize the read_init value on first sample
|
116
|
+
self.read_init = psutil.disk_io_counters().read_bytes
|
117
|
+
self.samples.append(
|
118
|
+
(psutil.disk_io_counters().read_bytes - self.read_init) / 1024 / 1024
|
119
|
+
)
|
120
|
+
|
121
|
+
def clear(self) -> None:
|
122
|
+
self.samples.clear()
|
123
|
+
|
124
|
+
def aggregate(self) -> dict:
|
125
|
+
if not self.samples:
|
126
|
+
return {}
|
127
|
+
aggregate = aggregate_mean(self.samples)
|
128
|
+
return {self.name: aggregate}
|
129
|
+
|
130
|
+
|
131
|
+
class DiskOut:
|
132
|
+
"""Total system disk write in MB."""
|
133
|
+
|
134
|
+
name = "disk.out"
|
25
135
|
samples: "Deque[float]"
|
26
136
|
|
27
137
|
def __init__(self) -> None:
|
28
138
|
self.samples = deque([])
|
139
|
+
self.write_init: Optional[int] = None
|
29
140
|
|
30
141
|
def sample(self) -> None:
|
31
|
-
self.
|
142
|
+
if self.write_init is None:
|
143
|
+
# init on first sample
|
144
|
+
self.write_init = psutil.disk_io_counters().write_bytes
|
145
|
+
self.samples.append(
|
146
|
+
(psutil.disk_io_counters().write_bytes - self.write_init) / 1024 / 1024
|
147
|
+
)
|
32
148
|
|
33
149
|
def clear(self) -> None:
|
34
150
|
self.samples.clear()
|
@@ -49,7 +165,13 @@ class Disk:
|
|
49
165
|
shutdown_event: threading.Event,
|
50
166
|
) -> None:
|
51
167
|
self.name = self.__class__.__name__.lower()
|
52
|
-
self.
|
168
|
+
self.settings = settings
|
169
|
+
self.metrics: List[Metric] = [
|
170
|
+
DiskUsagePercent(list(settings._stats_disk_paths or ["/"])),
|
171
|
+
DiskUsage(list(settings._stats_disk_paths or ["/"])),
|
172
|
+
DiskIn(),
|
173
|
+
DiskOut(),
|
174
|
+
]
|
53
175
|
self.metrics_monitor = MetricsMonitor(
|
54
176
|
self.name,
|
55
177
|
self.metrics,
|
@@ -64,11 +186,22 @@ class Disk:
|
|
64
186
|
return psutil is not None
|
65
187
|
|
66
188
|
def probe(self) -> dict:
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
189
|
+
disk_paths = list(self.settings._stats_disk_paths or ["/"])
|
190
|
+
disk_metrics = {}
|
191
|
+
for disk_path in disk_paths:
|
192
|
+
try:
|
193
|
+
# total disk space in GB:
|
194
|
+
total = psutil.disk_usage(disk_path).total / 1024 / 1024 / 1024
|
195
|
+
# total disk space used in GB:
|
196
|
+
used = psutil.disk_usage(disk_path).used / 1024 / 1024 / 1024
|
197
|
+
disk_metrics[disk_path] = {
|
198
|
+
"total": total,
|
199
|
+
"used": used,
|
200
|
+
}
|
201
|
+
except Exception as e: # noqa
|
202
|
+
termwarn(f"Could not access disk path {disk_path}: {e}", repeat=False)
|
203
|
+
|
204
|
+
return {self.name: disk_metrics}
|
72
205
|
|
73
206
|
def start(self) -> None:
|
74
207
|
self.metrics_monitor.start()
|
@@ -19,6 +19,7 @@ from wandb.sdk.lib.filenames import (
|
|
19
19
|
REQUIREMENTS_FNAME,
|
20
20
|
)
|
21
21
|
from wandb.sdk.lib.gitlib import GitRepo
|
22
|
+
from wandb.sdk.wandb_settings import _get_program_relpath
|
22
23
|
|
23
24
|
from .assets.interfaces import Interface
|
24
25
|
|
@@ -87,7 +88,7 @@ class SystemInfo:
|
|
87
88
|
|
88
89
|
def _save_code(self) -> None:
|
89
90
|
logger.debug("Saving code")
|
90
|
-
if self.settings.program_relpath
|
91
|
+
if not self.settings.program_relpath:
|
91
92
|
logger.warning("unable to save code -- program entry not found")
|
92
93
|
return None
|
93
94
|
|
@@ -210,8 +211,11 @@ class SystemInfo:
|
|
210
211
|
|
211
212
|
if self.settings.program is not None:
|
212
213
|
data["program"] = self.settings.program
|
214
|
+
# Used during artifact-job creation, always points to the relpath
|
215
|
+
# of code execution, even when in a git repo
|
216
|
+
data["codePathLocal"] = _get_program_relpath(self.settings.program)
|
213
217
|
if not self.settings.disable_code:
|
214
|
-
if self.settings.program_relpath
|
218
|
+
if self.settings.program_relpath:
|
215
219
|
data["codePath"] = self.settings.program_relpath
|
216
220
|
elif self.settings._jupyter:
|
217
221
|
if self.settings.notebook_name:
|
wandb/sdk/launch/__init__.py
CHANGED
@@ -124,24 +124,28 @@ def create_and_run_agent(
|
|
124
124
|
agent.loop()
|
125
125
|
|
126
126
|
|
127
|
-
def
|
128
|
-
uri: Optional[str],
|
129
|
-
job: Optional[str],
|
130
|
-
name: Optional[str],
|
131
|
-
project: Optional[str],
|
132
|
-
entity: Optional[str],
|
133
|
-
docker_image: Optional[str],
|
134
|
-
entry_point: Optional[List[str]],
|
135
|
-
version: Optional[str],
|
136
|
-
resource: str,
|
137
|
-
resource_args: Optional[Dict[str, Any]],
|
138
|
-
launch_config: Optional[Dict[str, Any]],
|
139
|
-
synchronous: Optional[bool],
|
127
|
+
def _launch(
|
140
128
|
api: Api,
|
141
|
-
|
142
|
-
|
129
|
+
uri: Optional[str] = None,
|
130
|
+
job: Optional[str] = None,
|
131
|
+
name: Optional[str] = None,
|
132
|
+
project: Optional[str] = None,
|
133
|
+
entity: Optional[str] = None,
|
134
|
+
docker_image: Optional[str] = None,
|
135
|
+
entry_point: Optional[List[str]] = None,
|
136
|
+
version: Optional[str] = None,
|
137
|
+
resource: Optional[str] = None,
|
138
|
+
resource_args: Optional[Dict[str, Any]] = None,
|
139
|
+
launch_config: Optional[Dict[str, Any]] = None,
|
140
|
+
synchronous: Optional[bool] = None,
|
141
|
+
run_id: Optional[str] = None,
|
142
|
+
repository: Optional[str] = None,
|
143
143
|
) -> AbstractRun:
|
144
144
|
"""Helper that delegates to the project-running method corresponding to the passed-in backend."""
|
145
|
+
if launch_config is None:
|
146
|
+
launch_config = {}
|
147
|
+
if resource is None:
|
148
|
+
resource = "local-container"
|
145
149
|
launch_spec = construct_launch_spec(
|
146
150
|
uri,
|
147
151
|
job,
|
@@ -193,9 +197,8 @@ def _run(
|
|
193
197
|
)
|
194
198
|
|
195
199
|
|
196
|
-
def
|
200
|
+
def launch(
|
197
201
|
api: Api,
|
198
|
-
uri: Optional[str] = None,
|
199
202
|
job: Optional[str] = None,
|
200
203
|
entry_point: Optional[List[str]] = None,
|
201
204
|
version: Optional[str] = None,
|
@@ -210,41 +213,43 @@ def run(
|
|
210
213
|
run_id: Optional[str] = None,
|
211
214
|
repository: Optional[str] = None,
|
212
215
|
) -> AbstractRun:
|
213
|
-
"""
|
216
|
+
"""Launch a W&B launch experiment.
|
214
217
|
|
215
218
|
Arguments:
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
repository: string name of repository path for remote registry
|
219
|
+
job: string reference to a wandb.Job eg: wandb/test/my-job:latest
|
220
|
+
api: An instance of a wandb Api from wandb.apis.internal.
|
221
|
+
entry_point: Entry point to run within the project. Defaults to using the entry point used
|
222
|
+
in the original run for wandb URIs, or main.py for git repository URIs.
|
223
|
+
version: For Git-based projects, either a commit hash or a branch name.
|
224
|
+
name: Name run under which to launch the run.
|
225
|
+
resource: Execution backend for the run.
|
226
|
+
resource_args: Resource related arguments for launching runs onto a remote backend.
|
227
|
+
Will be stored on the constructed launch config under ``resource_args``.
|
228
|
+
project: Target project to send launched run to
|
229
|
+
entity: Target entity to send launched run to
|
230
|
+
config: A dictionary containing the configuration for the run. May also contain
|
231
|
+
resource specific arguments under the key "resource_args".
|
232
|
+
synchronous: Whether to block while waiting for a run to complete. Defaults to True.
|
233
|
+
Note that if ``synchronous`` is False and ``backend`` is "local-container", this
|
234
|
+
method will return, but the current process will block when exiting until
|
235
|
+
the local run completes. If the current process is interrupted, any
|
236
|
+
asynchronous runs launched via this method will be terminated. If
|
237
|
+
``synchronous`` is True and the run fails, the current process will
|
238
|
+
error out as well.
|
239
|
+
run_id: ID for the run (To ultimately replace the :name: field)
|
240
|
+
repository: string name of repository path for remote registry
|
239
241
|
|
240
242
|
Example:
|
241
|
-
|
242
|
-
|
243
|
-
|
243
|
+
```python
|
244
|
+
from wandb.sdk.launch import launch
|
245
|
+
|
246
|
+
job = "wandb/jobs/Hello World:latest"
|
247
|
+
params = {"epochs": 5}
|
244
248
|
# Run W&B project and create a reproducible docker environment
|
245
249
|
# on a local host
|
246
250
|
api = wandb.apis.internal.Api()
|
247
|
-
|
251
|
+
launch(api, job, parameters=params)
|
252
|
+
```
|
248
253
|
|
249
254
|
|
250
255
|
Returns:
|
@@ -255,15 +260,9 @@ def run(
|
|
255
260
|
`wandb.exceptions.ExecutionError` If a run launched in blocking mode
|
256
261
|
is unsuccessful.
|
257
262
|
"""
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
# default to local container for runs without a queue
|
262
|
-
if resource is None:
|
263
|
-
resource = "local-container"
|
264
|
-
|
265
|
-
submitted_run_obj = _run(
|
266
|
-
uri=uri,
|
263
|
+
submitted_run_obj = _launch(
|
264
|
+
# TODO: fully deprecate URI path
|
265
|
+
uri=None,
|
267
266
|
job=job,
|
268
267
|
name=name,
|
269
268
|
project=project,
|