wandb 0.21.4__py3-none-musllinux_1_2_aarch64.whl → 0.22.0__py3-none-musllinux_1_2_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +1 -1
- wandb/__init__.pyi +1 -1
- wandb/apis/public/__init__.py +42 -0
- wandb/apis/public/runs.py +24 -6
- wandb/bin/gpu_stats +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/proto/v3/wandb_internal_pb2.py +234 -224
- wandb/proto/v4/wandb_internal_pb2.py +226 -224
- wandb/proto/v5/wandb_internal_pb2.py +226 -224
- wandb/proto/v6/wandb_base_pb2.py +3 -3
- wandb/proto/v6/wandb_internal_pb2.py +229 -227
- wandb/proto/v6/wandb_server_pb2.py +3 -3
- wandb/proto/v6/wandb_settings_pb2.py +3 -3
- wandb/proto/v6/wandb_sync_pb2.py +3 -3
- wandb/proto/v6/wandb_telemetry_pb2.py +3 -3
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +1 -1
- wandb/sdk/artifacts/storage_handlers/http_handler.py +1 -3
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +1 -1
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +1 -1
- wandb/sdk/artifacts/storage_policies/_factories.py +63 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +59 -124
- wandb/sdk/interface/interface.py +10 -0
- wandb/sdk/interface/interface_shared.py +9 -0
- wandb/sdk/wandb_init.py +9 -1
- wandb/wandb_agent.py +35 -4
- {wandb-0.21.4.dist-info → wandb-0.22.0.dist-info}/METADATA +1 -1
- {wandb-0.21.4.dist-info → wandb-0.22.0.dist-info}/RECORD +811 -810
- {wandb-0.21.4.dist-info → wandb-0.22.0.dist-info}/WHEEL +0 -0
- {wandb-0.21.4.dist-info → wandb-0.22.0.dist-info}/entry_points.txt +0 -0
- {wandb-0.21.4.dist-info → wandb-0.22.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
4
4
|
# source: wandb/proto/wandb_server.proto
|
5
|
-
# Protobuf Python Version: 6.
|
5
|
+
# Protobuf Python Version: 6.31.1
|
6
6
|
"""Generated protocol buffer code."""
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
@@ -12,8 +12,8 @@ from google.protobuf.internal import builder as _builder
|
|
12
12
|
_runtime_version.ValidateProtobufRuntimeVersion(
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
14
14
|
6,
|
15
|
-
|
16
|
-
|
15
|
+
31,
|
16
|
+
1,
|
17
17
|
'',
|
18
18
|
'wandb/proto/wandb_server.proto'
|
19
19
|
)
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
4
4
|
# source: wandb/proto/wandb_settings.proto
|
5
|
-
# Protobuf Python Version: 6.
|
5
|
+
# Protobuf Python Version: 6.31.1
|
6
6
|
"""Generated protocol buffer code."""
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
@@ -12,8 +12,8 @@ from google.protobuf.internal import builder as _builder
|
|
12
12
|
_runtime_version.ValidateProtobufRuntimeVersion(
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
14
14
|
6,
|
15
|
-
|
16
|
-
|
15
|
+
31,
|
16
|
+
1,
|
17
17
|
'',
|
18
18
|
'wandb/proto/wandb_settings.proto'
|
19
19
|
)
|
wandb/proto/v6/wandb_sync_pb2.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
4
4
|
# source: wandb/proto/wandb_sync.proto
|
5
|
-
# Protobuf Python Version: 6.
|
5
|
+
# Protobuf Python Version: 6.31.1
|
6
6
|
"""Generated protocol buffer code."""
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
@@ -12,8 +12,8 @@ from google.protobuf.internal import builder as _builder
|
|
12
12
|
_runtime_version.ValidateProtobufRuntimeVersion(
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
14
14
|
6,
|
15
|
-
|
16
|
-
|
15
|
+
31,
|
16
|
+
1,
|
17
17
|
'',
|
18
18
|
'wandb/proto/wandb_sync.proto'
|
19
19
|
)
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
4
4
|
# source: wandb/proto/wandb_telemetry.proto
|
5
|
-
# Protobuf Python Version: 6.
|
5
|
+
# Protobuf Python Version: 6.31.1
|
6
6
|
"""Generated protocol buffer code."""
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
@@ -12,8 +12,8 @@ from google.protobuf.internal import builder as _builder
|
|
12
12
|
_runtime_version.ValidateProtobufRuntimeVersion(
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
14
14
|
6,
|
15
|
-
|
16
|
-
|
15
|
+
31,
|
16
|
+
1,
|
17
17
|
'',
|
18
18
|
'wandb/proto/wandb_telemetry.proto'
|
19
19
|
)
|
@@ -65,7 +65,7 @@ class GCSHandler(StorageHandler):
|
|
65
65
|
path, hit, cache_open = self._cache.check_etag_obj_path(
|
66
66
|
url=URIStr(manifest_entry.ref),
|
67
67
|
etag=ETag(manifest_entry.digest),
|
68
|
-
size=manifest_entry.size
|
68
|
+
size=manifest_entry.size or 0,
|
69
69
|
)
|
70
70
|
if hit:
|
71
71
|
return path
|
@@ -43,7 +43,7 @@ class HTTPHandler(StorageHandler):
|
|
43
43
|
path, hit, cache_open = self._cache.check_etag_obj_path(
|
44
44
|
URIStr(manifest_entry.ref),
|
45
45
|
ETag(manifest_entry.digest),
|
46
|
-
manifest_entry.size
|
46
|
+
manifest_entry.size or 0,
|
47
47
|
)
|
48
48
|
if hit:
|
49
49
|
return path
|
@@ -54,7 +54,6 @@ class HTTPHandler(StorageHandler):
|
|
54
54
|
cookies=_thread_local_api_settings.cookies,
|
55
55
|
headers=_thread_local_api_settings.headers,
|
56
56
|
)
|
57
|
-
response.raise_for_status()
|
58
57
|
|
59
58
|
digest: ETag | FilePathStr | URIStr | None
|
60
59
|
digest, size, extra = self._entry_from_headers(response.headers)
|
@@ -87,7 +86,6 @@ class HTTPHandler(StorageHandler):
|
|
87
86
|
cookies=_thread_local_api_settings.cookies,
|
88
87
|
headers=_thread_local_api_settings.headers,
|
89
88
|
) as response:
|
90
|
-
response.raise_for_status()
|
91
89
|
digest: ETag | FilePathStr | URIStr | None
|
92
90
|
digest, size, extra = self._entry_from_headers(response.headers)
|
93
91
|
digest = digest or path
|
@@ -51,7 +51,7 @@ class LocalFileHandler(StorageHandler):
|
|
51
51
|
|
52
52
|
path, hit, cache_open = self._cache.check_md5_obj_path(
|
53
53
|
B64MD5(manifest_entry.digest), # TODO(spencerpearson): unsafe cast
|
54
|
-
manifest_entry.size
|
54
|
+
manifest_entry.size or 0,
|
55
55
|
)
|
56
56
|
if hit:
|
57
57
|
return path
|
@@ -96,7 +96,7 @@ class S3Handler(StorageHandler):
|
|
96
96
|
path, hit, cache_open = self._cache.check_etag_obj_path(
|
97
97
|
URIStr(manifest_entry.ref),
|
98
98
|
ETag(manifest_entry.digest),
|
99
|
-
manifest_entry.size
|
99
|
+
manifest_entry.size or 0,
|
100
100
|
)
|
101
101
|
if hit:
|
102
102
|
return path
|
@@ -0,0 +1,63 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import Final
|
4
|
+
|
5
|
+
from requests import Response, Session
|
6
|
+
from requests.adapters import HTTPAdapter
|
7
|
+
from urllib3.util.retry import Retry
|
8
|
+
|
9
|
+
from ..storage_handler import StorageHandler
|
10
|
+
from ..storage_handlers.azure_handler import AzureHandler
|
11
|
+
from ..storage_handlers.gcs_handler import GCSHandler
|
12
|
+
from ..storage_handlers.http_handler import HTTPHandler
|
13
|
+
from ..storage_handlers.local_file_handler import LocalFileHandler
|
14
|
+
from ..storage_handlers.s3_handler import S3Handler
|
15
|
+
from ..storage_handlers.wb_artifact_handler import WBArtifactHandler
|
16
|
+
from ..storage_handlers.wb_local_artifact_handler import WBLocalArtifactHandler
|
17
|
+
|
18
|
+
# Sleep length: 0, 2, 4, 8, 16, 32, 64, 120, 120, 120, 120, 120, 120, 120, 120, 120
|
19
|
+
# seconds, i.e. a total of 20min 6s.
|
20
|
+
HTTP_RETRY_STRATEGY: Final[Retry] = Retry(
|
21
|
+
backoff_factor=1,
|
22
|
+
total=16,
|
23
|
+
status_forcelist=(308, 408, 409, 429, 500, 502, 503, 504),
|
24
|
+
)
|
25
|
+
HTTP_POOL_CONNECTIONS: Final[int] = 64
|
26
|
+
HTTP_POOL_MAXSIZE: Final[int] = 64
|
27
|
+
|
28
|
+
|
29
|
+
def raise_for_status(response: Response, *_, **__) -> None:
|
30
|
+
"""A `requests.Session` hook to raise for status on all requests."""
|
31
|
+
response.raise_for_status()
|
32
|
+
|
33
|
+
|
34
|
+
def make_http_session() -> Session:
|
35
|
+
"""A factory that returns a `requests.Session` for use with artifact storage handlers."""
|
36
|
+
session = Session()
|
37
|
+
|
38
|
+
# Explicitly configure the retry strategy for http/https adapters.
|
39
|
+
adapter = HTTPAdapter(
|
40
|
+
max_retries=HTTP_RETRY_STRATEGY,
|
41
|
+
pool_connections=HTTP_POOL_CONNECTIONS,
|
42
|
+
pool_maxsize=HTTP_POOL_MAXSIZE,
|
43
|
+
)
|
44
|
+
session.mount("http://", adapter)
|
45
|
+
session.mount("https://", adapter)
|
46
|
+
|
47
|
+
# Always raise on HTTP status errors.
|
48
|
+
session.hooks["response"].append(raise_for_status)
|
49
|
+
return session
|
50
|
+
|
51
|
+
|
52
|
+
def make_storage_handlers(session: Session) -> list[StorageHandler]:
|
53
|
+
"""A factory that returns the default artifact storage handlers."""
|
54
|
+
return [
|
55
|
+
S3Handler(), # s3
|
56
|
+
GCSHandler(), # gcs
|
57
|
+
AzureHandler(), # azure
|
58
|
+
HTTPHandler(session, scheme="http"), # http
|
59
|
+
HTTPHandler(session, scheme="https"), # https
|
60
|
+
WBArtifactHandler(), # artifact
|
61
|
+
WBLocalArtifactHandler(), # local_artifact
|
62
|
+
LocalFileHandler(), # file_handler
|
63
|
+
]
|
@@ -16,7 +16,6 @@ from typing import IO, TYPE_CHECKING, Any, NamedTuple, Sequence
|
|
16
16
|
from urllib.parse import quote
|
17
17
|
|
18
18
|
import requests
|
19
|
-
import urllib3
|
20
19
|
|
21
20
|
from wandb import env
|
22
21
|
from wandb.errors.term import termwarn
|
@@ -27,40 +26,24 @@ from wandb.sdk.artifacts.artifact_file_cache import (
|
|
27
26
|
get_artifact_file_cache,
|
28
27
|
)
|
29
28
|
from wandb.sdk.artifacts.staging import get_staging_dir
|
30
|
-
from wandb.sdk.artifacts.storage_handlers.azure_handler import AzureHandler
|
31
|
-
from wandb.sdk.artifacts.storage_handlers.gcs_handler import GCSHandler
|
32
|
-
from wandb.sdk.artifacts.storage_handlers.http_handler import HTTPHandler
|
33
|
-
from wandb.sdk.artifacts.storage_handlers.local_file_handler import LocalFileHandler
|
34
29
|
from wandb.sdk.artifacts.storage_handlers.multi_handler import MultiHandler
|
35
|
-
from wandb.sdk.artifacts.storage_handlers.s3_handler import S3Handler
|
36
30
|
from wandb.sdk.artifacts.storage_handlers.tracking_handler import TrackingHandler
|
37
|
-
from wandb.sdk.artifacts.storage_handlers.wb_artifact_handler import WBArtifactHandler
|
38
|
-
from wandb.sdk.artifacts.storage_handlers.wb_local_artifact_handler import (
|
39
|
-
WBLocalArtifactHandler,
|
40
|
-
)
|
41
31
|
from wandb.sdk.artifacts.storage_layout import StorageLayout
|
42
32
|
from wandb.sdk.artifacts.storage_policies.register import WANDB_STORAGE_POLICY
|
43
33
|
from wandb.sdk.artifacts.storage_policy import StoragePolicy
|
44
34
|
from wandb.sdk.internal.internal_api import Api as InternalApi
|
45
35
|
from wandb.sdk.internal.thread_local_settings import _thread_local_api_settings
|
46
|
-
from wandb.sdk.lib.hashutil import
|
36
|
+
from wandb.sdk.lib.hashutil import b64_to_hex_id, hex_to_b64_id
|
47
37
|
from wandb.sdk.lib.paths import FilePathStr, URIStr
|
48
38
|
|
39
|
+
from ._factories import make_http_session, make_storage_handlers
|
40
|
+
|
49
41
|
if TYPE_CHECKING:
|
50
42
|
from wandb.filesync.step_prepare import StepPrepare
|
51
43
|
from wandb.sdk.artifacts.artifact import Artifact
|
52
44
|
from wandb.sdk.artifacts.artifact_manifest_entry import ArtifactManifestEntry
|
53
45
|
from wandb.sdk.internal import progress
|
54
46
|
|
55
|
-
# Sleep length: 0, 2, 4, 8, 16, 32, 64, 120, 120, 120, 120, 120, 120, 120, 120, 120
|
56
|
-
# seconds, i.e. a total of 20min 6s.
|
57
|
-
_REQUEST_RETRY_STRATEGY = urllib3.util.retry.Retry(
|
58
|
-
backoff_factor=1,
|
59
|
-
total=16,
|
60
|
-
status_forcelist=(308, 408, 409, 429, 500, 502, 503, 504),
|
61
|
-
)
|
62
|
-
_REQUEST_POOL_CONNECTIONS = 64
|
63
|
-
_REQUEST_POOL_MAXSIZE = 64
|
64
47
|
|
65
48
|
# AWS S3 max upload parts without having to make additional requests for extra parts
|
66
49
|
S3_MAX_PART_NUMBERS = 1000
|
@@ -96,48 +79,23 @@ class WandbStoragePolicy(StoragePolicy):
|
|
96
79
|
|
97
80
|
@classmethod
|
98
81
|
def from_config(
|
99
|
-
cls, config: dict, api: InternalApi | None = None
|
82
|
+
cls, config: dict[str, Any], api: InternalApi | None = None
|
100
83
|
) -> WandbStoragePolicy:
|
101
84
|
return cls(config=config, api=api)
|
102
85
|
|
103
86
|
def __init__(
|
104
87
|
self,
|
105
|
-
config: dict | None = None,
|
88
|
+
config: dict[str, Any] | None = None,
|
106
89
|
cache: ArtifactFileCache | None = None,
|
107
90
|
api: InternalApi | None = None,
|
91
|
+
session: requests.Session | None = None,
|
108
92
|
) -> None:
|
109
|
-
self._cache = cache or get_artifact_file_cache()
|
110
93
|
self._config = config or {}
|
111
|
-
self.
|
112
|
-
|
113
|
-
max_retries=_REQUEST_RETRY_STRATEGY,
|
114
|
-
pool_connections=_REQUEST_POOL_CONNECTIONS,
|
115
|
-
pool_maxsize=_REQUEST_POOL_MAXSIZE,
|
116
|
-
)
|
117
|
-
self._session.mount("http://", adapter)
|
118
|
-
self._session.mount("https://", adapter)
|
119
|
-
|
120
|
-
s3 = S3Handler()
|
121
|
-
gcs = GCSHandler()
|
122
|
-
azure = AzureHandler()
|
123
|
-
http = HTTPHandler(self._session)
|
124
|
-
https = HTTPHandler(self._session, scheme="https")
|
125
|
-
artifact = WBArtifactHandler()
|
126
|
-
local_artifact = WBLocalArtifactHandler()
|
127
|
-
file_handler = LocalFileHandler()
|
128
|
-
|
94
|
+
self._cache = cache or get_artifact_file_cache()
|
95
|
+
self._session = session or make_http_session()
|
129
96
|
self._api = api or InternalApi()
|
130
97
|
self._handler = MultiHandler(
|
131
|
-
handlers=
|
132
|
-
s3,
|
133
|
-
gcs,
|
134
|
-
azure,
|
135
|
-
http,
|
136
|
-
https,
|
137
|
-
artifact,
|
138
|
-
local_artifact,
|
139
|
-
file_handler,
|
140
|
-
],
|
98
|
+
handlers=make_storage_handlers(self._session),
|
141
99
|
default_handler=TrackingHandler(),
|
142
100
|
)
|
143
101
|
|
@@ -167,54 +125,52 @@ class WandbStoragePolicy(StoragePolicy):
|
|
167
125
|
self._cache._override_cache_path = dest_path
|
168
126
|
|
169
127
|
path, hit, cache_open = self._cache.check_md5_obj_path(
|
170
|
-
|
171
|
-
|
128
|
+
manifest_entry.digest,
|
129
|
+
size=manifest_entry.size or 0,
|
172
130
|
)
|
173
131
|
if hit:
|
174
132
|
return path
|
175
133
|
|
176
|
-
if manifest_entry._download_url is not None:
|
134
|
+
if (url := manifest_entry._download_url) is not None:
|
177
135
|
# Use multipart parallel download for large file
|
178
136
|
if (
|
179
|
-
executor
|
180
|
-
and manifest_entry.size
|
181
|
-
and self._should_multipart_download(
|
137
|
+
executor
|
138
|
+
and (size := manifest_entry.size)
|
139
|
+
and self._should_multipart_download(size, multipart)
|
182
140
|
):
|
183
|
-
self._multipart_file_download(
|
184
|
-
executor,
|
185
|
-
manifest_entry._download_url,
|
186
|
-
manifest_entry.size,
|
187
|
-
cache_open,
|
188
|
-
)
|
141
|
+
self._multipart_file_download(executor, url, size, cache_open)
|
189
142
|
return path
|
143
|
+
|
190
144
|
# Serial download
|
191
|
-
response = self._session.get(manifest_entry._download_url, stream=True)
|
192
145
|
try:
|
193
|
-
response.
|
194
|
-
except
|
146
|
+
response = self._session.get(url, stream=True)
|
147
|
+
except requests.HTTPError:
|
195
148
|
# Signed URL might have expired, fall back to fetching it one by one.
|
196
149
|
manifest_entry._download_url = None
|
150
|
+
|
197
151
|
if manifest_entry._download_url is None:
|
198
152
|
auth = None
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
153
|
+
headers = _thread_local_api_settings.headers
|
154
|
+
cookies = _thread_local_api_settings.cookies
|
155
|
+
|
156
|
+
# For auth, prefer using (in order): auth header, cookies, HTTP Basic Auth
|
157
|
+
if token := self._api.access_token:
|
158
|
+
headers = {**(headers or {}), "Authorization": f"Bearer {token}"}
|
159
|
+
elif cookies is not None:
|
160
|
+
pass
|
161
|
+
else:
|
203
162
|
auth = ("api", self._api.api_key or "")
|
163
|
+
|
164
|
+
file_url = self._file_url(
|
165
|
+
self._api,
|
166
|
+
artifact.entity,
|
167
|
+
artifact.project,
|
168
|
+
artifact.name.split(":")[0],
|
169
|
+
manifest_entry,
|
170
|
+
)
|
204
171
|
response = self._session.get(
|
205
|
-
|
206
|
-
self._api,
|
207
|
-
artifact.entity,
|
208
|
-
artifact.project,
|
209
|
-
artifact.name.split(":")[0],
|
210
|
-
manifest_entry,
|
211
|
-
),
|
212
|
-
auth=auth,
|
213
|
-
cookies=_thread_local_api_settings.cookies,
|
214
|
-
headers=http_headers,
|
215
|
-
stream=True,
|
172
|
+
file_url, auth=auth, cookies=cookies, headers=headers, stream=True
|
216
173
|
)
|
217
|
-
response.raise_for_status()
|
218
174
|
|
219
175
|
with cache_open(mode="wb") as file:
|
220
176
|
for data in response.iter_content(chunk_size=16 * 1024):
|
@@ -269,12 +225,7 @@ class WandbStoragePolicy(StoragePolicy):
|
|
269
225
|
# Other threads has error, no need to start
|
270
226
|
if download_has_error.is_set():
|
271
227
|
return
|
272
|
-
response = self._session.get(
|
273
|
-
url=download_url,
|
274
|
-
headers=headers,
|
275
|
-
stream=True,
|
276
|
-
)
|
277
|
-
response.raise_for_status()
|
228
|
+
response = self._session.get(url=download_url, headers=headers, stream=True)
|
278
229
|
|
279
230
|
file_offset = start
|
280
231
|
for content in response.iter_content(chunk_size=_HTTP_RES_CHUNK_SIZE_BYTES):
|
@@ -376,43 +327,27 @@ class WandbStoragePolicy(StoragePolicy):
|
|
376
327
|
entity_name: str,
|
377
328
|
project_name: str,
|
378
329
|
artifact_name: str,
|
379
|
-
|
330
|
+
entry: ArtifactManifestEntry,
|
380
331
|
) -> str:
|
381
|
-
|
382
|
-
|
383
|
-
md5_hex = b64_to_hex_id(
|
332
|
+
layout = self._config.get("storageLayout", StorageLayout.V1)
|
333
|
+
region = self._config.get("storageRegion", "default")
|
334
|
+
md5_hex = b64_to_hex_id(entry.digest)
|
384
335
|
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
336
|
+
base_url: str = api.settings("base_url")
|
337
|
+
|
338
|
+
if layout == StorageLayout.V1:
|
339
|
+
return f"{base_url}/artifacts/{entity_name}/{md5_hex}"
|
340
|
+
|
341
|
+
if layout == StorageLayout.V2:
|
342
|
+
birth_artifact_id = entry.birth_artifact_id or ""
|
390
343
|
if api._server_supports(
|
391
|
-
ServerFeature.ARTIFACT_COLLECTION_MEMBERSHIP_FILE_DOWNLOAD_HANDLER
|
344
|
+
ServerFeature.ARTIFACT_COLLECTION_MEMBERSHIP_FILE_DOWNLOAD_HANDLER
|
392
345
|
):
|
393
|
-
return "{}/artifactsV2/{}/{}/{}/{}/{}/{}/{}"
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
quote(artifact_name),
|
399
|
-
quote(manifest_entry.birth_artifact_id or ""),
|
400
|
-
md5_hex,
|
401
|
-
manifest_entry.path.name,
|
402
|
-
)
|
403
|
-
return "{}/artifactsV2/{}/{}/{}/{}".format(
|
404
|
-
api.settings("base_url"),
|
405
|
-
storage_region,
|
406
|
-
entity_name,
|
407
|
-
quote(
|
408
|
-
manifest_entry.birth_artifact_id
|
409
|
-
if manifest_entry.birth_artifact_id is not None
|
410
|
-
else ""
|
411
|
-
),
|
412
|
-
md5_hex,
|
413
|
-
)
|
414
|
-
else:
|
415
|
-
raise Exception(f"unrecognized storage layout: {storage_layout}")
|
346
|
+
return f"{base_url}/artifactsV2/{region}/{quote(entity_name)}/{quote(project_name)}/{quote(artifact_name)}/{quote(birth_artifact_id)}/{md5_hex}/{entry.path.name}"
|
347
|
+
|
348
|
+
return f"{base_url}/artifactsV2/{region}/{entity_name}/{quote(birth_artifact_id)}/{md5_hex}"
|
349
|
+
|
350
|
+
raise ValueError(f"unrecognized storage layout: {layout!r}")
|
416
351
|
|
417
352
|
def s3_multipart_file_upload(
|
418
353
|
self,
|
@@ -486,7 +421,7 @@ class WandbStoragePolicy(StoragePolicy):
|
|
486
421
|
True if the file was a duplicate (did not need to be uploaded),
|
487
422
|
False if it needed to be uploaded or was a reference (nothing to dedupe).
|
488
423
|
"""
|
489
|
-
file_size = entry.size
|
424
|
+
file_size = entry.size or 0
|
490
425
|
chunk_size = self.calc_chunk_size(file_size)
|
491
426
|
upload_parts = []
|
492
427
|
hex_digests = {}
|
@@ -562,8 +497,8 @@ class WandbStoragePolicy(StoragePolicy):
|
|
562
497
|
|
563
498
|
# Cache upon successful upload.
|
564
499
|
_, hit, cache_open = self._cache.check_md5_obj_path(
|
565
|
-
|
566
|
-
|
500
|
+
entry.digest,
|
501
|
+
size=entry.size or 0,
|
567
502
|
)
|
568
503
|
|
569
504
|
staging_dir = get_staging_dir()
|
wandb/sdk/interface/interface.py
CHANGED
@@ -883,6 +883,16 @@ class InterfaceBase:
|
|
883
883
|
) -> MailboxHandle[pb.Result]:
|
884
884
|
raise NotImplementedError
|
885
885
|
|
886
|
+
def publish_probe_system_info(self) -> None:
|
887
|
+
probe_system_info = pb.ProbeSystemInfoRequest()
|
888
|
+
return self._publish_probe_system_info(probe_system_info)
|
889
|
+
|
890
|
+
@abstractmethod
|
891
|
+
def _publish_probe_system_info(
|
892
|
+
self, probe_system_info: pb.ProbeSystemInfoRequest
|
893
|
+
) -> None:
|
894
|
+
raise NotImplementedError
|
895
|
+
|
886
896
|
def join(self) -> None:
|
887
897
|
# Drop indicates that the internal process has already been shutdown
|
888
898
|
if self._drop:
|
@@ -112,6 +112,7 @@ class InterfaceShared(InterfaceBase):
|
|
112
112
|
python_packages: Optional[pb.PythonPackagesRequest] = None,
|
113
113
|
job_input: Optional[pb.JobInputRequest] = None,
|
114
114
|
run_finish_without_exit: Optional[pb.RunFinishWithoutExitRequest] = None,
|
115
|
+
probe_system_info: Optional[pb.ProbeSystemInfoRequest] = None,
|
115
116
|
) -> pb.Record:
|
116
117
|
request = pb.Request()
|
117
118
|
if get_summary:
|
@@ -178,6 +179,8 @@ class InterfaceShared(InterfaceBase):
|
|
178
179
|
request.job_input.CopyFrom(job_input)
|
179
180
|
elif run_finish_without_exit:
|
180
181
|
request.run_finish_without_exit.CopyFrom(run_finish_without_exit)
|
182
|
+
elif probe_system_info:
|
183
|
+
request.probe_system_info.CopyFrom(probe_system_info)
|
181
184
|
else:
|
182
185
|
raise Exception("Invalid request")
|
183
186
|
record = self._make_record(request=request)
|
@@ -330,6 +333,12 @@ class InterfaceShared(InterfaceBase):
|
|
330
333
|
rec = self._make_record(use_artifact=use_artifact)
|
331
334
|
self._publish(rec)
|
332
335
|
|
336
|
+
def _publish_probe_system_info(
|
337
|
+
self, probe_system_info: pb.ProbeSystemInfoRequest
|
338
|
+
) -> None:
|
339
|
+
record = self._make_request(probe_system_info=probe_system_info)
|
340
|
+
self._publish(record)
|
341
|
+
|
333
342
|
def _deliver_artifact(
|
334
343
|
self,
|
335
344
|
log_artifact: pb.LogArtifactRequest,
|
wandb/sdk/wandb_init.py
CHANGED
@@ -839,6 +839,13 @@ class _WandbInit:
|
|
839
839
|
" and reinit is set to 'create_new', so continuing"
|
840
840
|
)
|
841
841
|
|
842
|
+
elif settings.resume == "must":
|
843
|
+
raise wandb.Error(
|
844
|
+
"Cannot resume a run while another run is active."
|
845
|
+
" You must either finish it using run.finish(),"
|
846
|
+
" or use reinit='create_new' when calling wandb.init()."
|
847
|
+
)
|
848
|
+
|
842
849
|
else:
|
843
850
|
run_printer.display(
|
844
851
|
"wandb.init() called while a run is active and reinit is"
|
@@ -864,7 +871,6 @@ class _WandbInit:
|
|
864
871
|
backend.ensure_launched()
|
865
872
|
self._logger.info("backend started and connected")
|
866
873
|
|
867
|
-
# resuming needs access to the server, check server_status()?
|
868
874
|
run = Run(
|
869
875
|
config=config.base_no_artifacts,
|
870
876
|
settings=settings,
|
@@ -1019,6 +1025,8 @@ class _WandbInit:
|
|
1019
1025
|
except TimeoutError:
|
1020
1026
|
pass
|
1021
1027
|
|
1028
|
+
backend.interface.publish_probe_system_info()
|
1029
|
+
|
1022
1030
|
assert self._wl is not None
|
1023
1031
|
self.run = run
|
1024
1032
|
|
wandb/wandb_agent.py
CHANGED
@@ -42,11 +42,42 @@ class AgentProcess:
|
|
42
42
|
if command:
|
43
43
|
if platform.system() == "Windows":
|
44
44
|
kwargs = dict(creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)
|
45
|
+
env.pop(wandb.env.SERVICE, None)
|
46
|
+
# TODO: Determine if we need the same stdin workaround as POSIX case below.
|
47
|
+
self._popen = subprocess.Popen(command, env=env, **kwargs)
|
45
48
|
else:
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
49
|
+
if sys.version_info >= (3, 11):
|
50
|
+
# preexec_fn=os.setpgrp is not thread-safe; process_group was introduced in
|
51
|
+
# python 3.11 to replace it, so use that when possible
|
52
|
+
kwargs = dict(process_group=0)
|
53
|
+
else:
|
54
|
+
kwargs = dict(preexec_fn=os.setpgrp)
|
55
|
+
env.pop(wandb.env.SERVICE, None)
|
56
|
+
# Upon spawning the subprocess in a new process group, the child's process group is
|
57
|
+
# not connected to the controlling terminal's stdin. If it tries to access stdin,
|
58
|
+
# it gets a SIGTTIN and blocks until we give it the terminal, which we don't want
|
59
|
+
# to do.
|
60
|
+
#
|
61
|
+
# By using subprocess.PIPE, we give it an independent stdin. However, it will still
|
62
|
+
# block if it tries to read from stdin, because we're not writing anything to it.
|
63
|
+
# We immediately close the subprocess's stdin here so it can fail fast and get an
|
64
|
+
# EOF.
|
65
|
+
#
|
66
|
+
# (One situation that makes this relevant is that importing `readline` even
|
67
|
+
# indirectly can cause the child to attempt to access stdin, which can trigger the
|
68
|
+
# deadlock. In Python 3.13, `import torch` indirectly imports `readline` via `pdb`,
|
69
|
+
# meaning `import torch` in a run script can deadlock unless we override stdin.
|
70
|
+
# See https://github.com/wandb/wandb/pull/10489 description for more details.)
|
71
|
+
#
|
72
|
+
# Also, we avoid spawning a new session because that breaks preempted child process
|
73
|
+
# handling.
|
74
|
+
self._popen = subprocess.Popen(
|
75
|
+
command,
|
76
|
+
env=env,
|
77
|
+
stdin=subprocess.PIPE,
|
78
|
+
**kwargs,
|
79
|
+
)
|
80
|
+
self._popen.stdin.close()
|
50
81
|
elif function:
|
51
82
|
self._proc = multiprocessing.Process(
|
52
83
|
target=self._start,
|