pluto-ml 0.0.22__tar.gz → 0.0.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/PKG-INFO +1 -1
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/__init__.py +2 -2
- pluto_ml-0.0.24/pluto/_fs.py +96 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/api.py +10 -3
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/wandb.py +211 -21
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/file.py +24 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/iface.py +5 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/init.py +43 -2
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/op.py +70 -3
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sets.py +11 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/process.py +16 -10
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/store.py +92 -16
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/util.py +78 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pyproject.toml +3 -1
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/LICENSE +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/README.md +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/__init__.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/__main__.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/__init__.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/lightning.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/neptune.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/torch.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/transformers.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/__main__.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/_wandb_hook.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/auth.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/__init__.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/_utils.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/lightning.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/neptune.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/neptune_query/__init__.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/neptune_query/filters.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/neptune_query/runs.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/torch.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/transformers.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/data.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/log.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/query.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sanitize.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sentry.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/store.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/__init__.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/__main__.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/retry.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sys.py +0 -0
- {pluto_ml-0.0.22 → pluto_ml-0.0.24}/zzzz_pluto_wandb_hook.pth +0 -0
|
@@ -41,11 +41,11 @@ __all__ = (
|
|
|
41
41
|
'generate_run_id',
|
|
42
42
|
)
|
|
43
43
|
|
|
44
|
-
__version__ = '0.0.
|
|
44
|
+
__version__ = '0.0.24'
|
|
45
45
|
|
|
46
46
|
|
|
47
47
|
# Replaced with the current commit when building the wheels.
|
|
48
|
-
_PLUTO_COMMIT_SHA = '
|
|
48
|
+
_PLUTO_COMMIT_SHA = '83cf832b478eb34d6458bd346b3c29cb6fa71a80'
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
def _get_git_commit():
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Filesystem detection helpers.
|
|
2
|
+
|
|
3
|
+
Pluto stages local state — most importantly the WAL-mode SQLite database used
|
|
4
|
+
to hand data from the training process to the sync process — under the run
|
|
5
|
+
directory. SQLite's WAL locking relies on POSIX byte-range locks plus a shared
|
|
6
|
+
-shm mmap, neither of which behaves reliably on network filesystems (NFS,
|
|
7
|
+
Lustre, SMB/CIFS, ...). On those mounts the lock handoff degrades into
|
|
8
|
+
SQLITE_PROTOCOL ("locking protocol") races that show up as repeated lock
|
|
9
|
+
retries and can badly throttle logging.
|
|
10
|
+
|
|
11
|
+
These helpers let `init()` detect that situation up front and tell the user to
|
|
12
|
+
point the staging dir (``pluto.init(dir=...)`` / ``PLUTO_DIR``) at node-local
|
|
13
|
+
storage. Detection is best-effort and Linux-only — on any other platform, or if
|
|
14
|
+
the mount table can't be read, we return ``None``/``False`` and stay silent
|
|
15
|
+
rather than risk a false alarm.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import re
|
|
20
|
+
from typing import Optional
|
|
21
|
+
|
|
22
|
+
# /proc/self/mountinfo escapes space, tab, newline and backslash in paths and
|
|
23
|
+
# fstypes as octal sequences (\040, \011, \012, \134). Decode them so prefix
|
|
24
|
+
# matching works for mount points that contain such characters.
|
|
25
|
+
_OCTAL_ESCAPE = re.compile(r'\\([0-7]{3})')
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _unescape_mountinfo(field: str) -> str:
|
|
29
|
+
return _OCTAL_ESCAPE.sub(lambda m: chr(int(m.group(1), 8)), field)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Filesystem type prefixes (as reported in /proc/self/mountinfo) whose locking
|
|
33
|
+
# semantics are unreliable for WAL-mode SQLite. Matched as prefixes so that
|
|
34
|
+
# e.g. both "nfs" and "nfs4", or "fuse.sshfs", are covered.
|
|
35
|
+
_NETWORK_FS_PREFIXES = (
|
|
36
|
+
'nfs',
|
|
37
|
+
'cifs',
|
|
38
|
+
'smb',
|
|
39
|
+
'lustre',
|
|
40
|
+
'gpfs',
|
|
41
|
+
'ceph',
|
|
42
|
+
'glusterfs',
|
|
43
|
+
'afs',
|
|
44
|
+
'ncpfs',
|
|
45
|
+
'fuse.sshfs',
|
|
46
|
+
'fuse.glusterfs',
|
|
47
|
+
'beegfs',
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get_fs_type(path: str) -> Optional[str]:
    """Return the filesystem type backing ``path``, or ``None`` if unknown.

    Linux-only: scans ``/proc/self/mountinfo`` and reports the type of the
    most specific (longest) mount point that is a path-prefix of the resolved
    ``path``. Matching is done on the resolved path string, so ``path`` need
    not exist yet — a not-yet-created run directory still resolves to its
    parent mount.

    Args:
        path: File or directory path to look up.

    Returns:
        The filesystem type string taken from the mount table, or ``None``
        when the table cannot be opened or parsed (which includes non-Linux
        platforms) or when no mount entry matches.
    """
    try:
        target = os.path.realpath(path)
        best_mount = ''
        best_type: Optional[str] = None
        # Read raw bytes instead of locale-decoded text: in text mode a single
        # mount entry containing bytes the locale codec rejects raises
        # UnicodeDecodeError (a ValueError), which would abort the whole
        # lookup and hide every other, perfectly readable entry.
        with open('/proc/self/mountinfo', 'rb') as f:
            for raw_line in f:
                # Line format:
                #   <id> <pid> <maj:min> <root> <mountpoint> <opts>
                #   [optional fields] - <fstype> <source> <superopts>
                left, sep, right = raw_line.partition(b' - ')
                if not sep:
                    continue
                # bytes.split() only breaks on ASCII whitespace, so non-ASCII
                # space characters inside a mount point stay in their field.
                left_fields = left.split()
                right_fields = right.split()
                if len(left_fields) < 5 or not right_fields:
                    continue
                # os.fsdecode() uses the same codec/error handler that
                # produced ``target``, keeping both strings comparable.
                mount_point = _unescape_mountinfo(os.fsdecode(left_fields[4]))
                fstype = _unescape_mountinfo(os.fsdecode(right_fields[0]))
                # Longest mount point that is a path-prefix of target wins.
                # The trailing '/' prevents "/data" from matching "/database".
                prefix = mount_point.rstrip('/') + '/'
                if target == mount_point or target.startswith(prefix):
                    # ">=": on a tie the later table entry replaces the
                    # earlier one.
                    if len(mount_point) >= len(best_mount):
                        best_mount = mount_point
                        best_type = fstype
        return best_type
    except (OSError, ValueError, IndexError):
        return None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def is_network_fs(path: str) -> bool:
    """Report whether ``path`` appears to sit on a network filesystem.

    The filesystem type found by ``get_fs_type`` is lower-cased and compared
    against ``_NETWORK_FS_PREFIXES`` by prefix. An undeterminable type
    (``None``) yields ``False``, so callers never warn without evidence.
    """
    detected = get_fs_type(path)
    if detected is None:
        return False
    return detected.lower().startswith(_NETWORK_FS_PREFIXES)
|
|
@@ -4,7 +4,7 @@ import re
|
|
|
4
4
|
import signal
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
|
|
7
|
-
from .util import clean_dict, find_node
|
|
7
|
+
from .util import clean_dict, config_json_default, find_node
|
|
8
8
|
|
|
9
9
|
logger = logging.getLogger(f'{__name__.split(".")[0]}')
|
|
10
10
|
tag = 'API'
|
|
@@ -40,7 +40,9 @@ def make_compat_start_v1(config, settings, info, tags=None):
|
|
|
40
40
|
'runName': settings._op_name,
|
|
41
41
|
'projectName': settings.project,
|
|
42
42
|
'externalId': settings._external_id, # User-provided run ID for multi-node
|
|
43
|
-
'config': json.dumps(config)
|
|
43
|
+
'config': json.dumps(config, default=config_json_default)
|
|
44
|
+
if config is not None
|
|
45
|
+
else None,
|
|
44
46
|
'loggerSettings': json.dumps(clean_dict(settings.to_dict())),
|
|
45
47
|
'systemMetadata': json.dumps(info) if info is not None else None,
|
|
46
48
|
'tags': tags if tags else None,
|
|
@@ -97,7 +99,9 @@ def make_compat_update_config_v1(settings, config):
|
|
|
97
99
|
return json.dumps(
|
|
98
100
|
{
|
|
99
101
|
'runId': settings._op_id,
|
|
100
|
-
'config': json.dumps(config)
|
|
102
|
+
'config': json.dumps(config, default=config_json_default)
|
|
103
|
+
if config
|
|
104
|
+
else None,
|
|
101
105
|
}
|
|
102
106
|
).encode()
|
|
103
107
|
|
|
@@ -166,6 +170,9 @@ def make_compat_file_v1(file, timestamp, step):
|
|
|
166
170
|
'logName': k,
|
|
167
171
|
'step': step,
|
|
168
172
|
}
|
|
173
|
+
caption = getattr(f, '_caption', None)
|
|
174
|
+
if caption is not None:
|
|
175
|
+
i['caption'] = caption
|
|
169
176
|
batch.append(i)
|
|
170
177
|
return json.dumps({'files': batch}).encode()
|
|
171
178
|
|
|
@@ -41,6 +41,8 @@ Hard Requirements:
|
|
|
41
41
|
"""
|
|
42
42
|
|
|
43
43
|
import atexit
|
|
44
|
+
import copy
|
|
45
|
+
import json
|
|
44
46
|
import logging
|
|
45
47
|
import os
|
|
46
48
|
import threading
|
|
@@ -55,6 +57,9 @@ from ._utils import (
|
|
|
55
57
|
|
|
56
58
|
logger = logging.getLogger(__name__)
|
|
57
59
|
|
|
60
|
+
# Distinct from None so config dedup can tell "never logged" from "logged None".
|
|
61
|
+
_MISSING = object()
|
|
62
|
+
|
|
58
63
|
_original_wandb_init = None
|
|
59
64
|
_original_wandb_log = None
|
|
60
65
|
_original_wandb_finish = None
|
|
@@ -91,6 +96,13 @@ class WandbRunWrapper:
|
|
|
91
96
|
self._fallback_step = 0 # Used when wandb is disabled (_step won't increment)
|
|
92
97
|
self._closed = False
|
|
93
98
|
self._close_lock = threading.Lock()
|
|
99
|
+
# Keys we've already warned about being unforwardable to Pluto, so a
|
|
100
|
+
# value logged every step warns once rather than spamming the logs.
|
|
101
|
+
self._unforwardable_warned: set = set()
|
|
102
|
+
# Last config values we synced to Pluto, keyed by log key. Lets us skip
|
|
103
|
+
# redundant update_config() calls when a str/bool/config value is logged
|
|
104
|
+
# unchanged every step (a common pattern: phase/status/checkpoint paths).
|
|
105
|
+
self._last_logged_config: Dict[str, Any] = {}
|
|
94
106
|
|
|
95
107
|
if self._pluto_run:
|
|
96
108
|
atexit.register(self._atexit_cleanup_pluto)
|
|
@@ -158,7 +170,26 @@ class WandbRunWrapper:
|
|
|
158
170
|
logger.debug(f'pluto.compat.wandb: Pluto finish timed out after {timeout}s')
|
|
159
171
|
|
|
160
172
|
def log(self, data: Dict[str, Any], step=None, commit=None, **kwargs):
|
|
161
|
-
"""Log metrics to both wandb and Pluto.
|
|
173
|
+
"""Log metrics to both wandb and Pluto.
|
|
174
|
+
|
|
175
|
+
Value routing for the Pluto side:
|
|
176
|
+
- int/float and any scalar exposing .item() (numpy/torch/etc.)
|
|
177
|
+
-> Pluto metrics (time-series), matching Pluto core's own log()
|
|
178
|
+
- wandb media (Image/Video/Audio/Histogram/Table), and lists
|
|
179
|
+
thereof -> converted Pluto media
|
|
180
|
+
- str and bool -> Pluto config (latest-wins). Pluto has no
|
|
181
|
+
string/bool time-series metric, so these mirror wandb's
|
|
182
|
+
summary/overview placement and stay queryable via
|
|
183
|
+
get_run().config.
|
|
184
|
+
- anything else with no metric/media mapping -> preserved as
|
|
185
|
+
config if it survives update_config's normalization (incl.
|
|
186
|
+
OmegaConf), otherwise dropped and reported to Sentry telemetry
|
|
187
|
+
once per key (a maintainer-coverage signal, not a user-facing
|
|
188
|
+
warning). See _handle_unforwardable.
|
|
189
|
+
|
|
190
|
+
str/bool/config values are deduped against the last synced value, so
|
|
191
|
+
logging an unchanged value every step doesn't spam update_config.
|
|
192
|
+
"""
|
|
162
193
|
# Determine the step to use for Pluto.
|
|
163
194
|
# When step is explicit, use it. Otherwise:
|
|
164
195
|
# - Normal mode: read wandb's _step before log() increments it
|
|
@@ -186,11 +217,29 @@ class WandbRunWrapper:
|
|
|
186
217
|
# Pluto.log() natively supports lists, so we just need
|
|
187
218
|
# to convert each element and pass the list through.
|
|
188
219
|
pluto_data: Dict[str, Any] = {}
|
|
220
|
+
# String values have no time-series metric equivalent in
|
|
221
|
+
# Pluto (op._process_log_item_sync only keeps int/float/
|
|
222
|
+
# tensor/File/Data). wandb puts loose strings in the run
|
|
223
|
+
# summary/overview; the closest Pluto analogue is config,
|
|
224
|
+
# which is latest-wins and queryable via get_run().config.
|
|
225
|
+
# This is what lets e.g. a resume skill read back the most
|
|
226
|
+
# recent checkpoint/r2_path for a run.
|
|
227
|
+
pluto_config: Dict[str, Any] = {}
|
|
189
228
|
for key, value in data.items():
|
|
190
|
-
if isinstance(value,
|
|
229
|
+
if isinstance(value, bool):
|
|
230
|
+
# bool is a subclass of int, but Pluto drops bool
|
|
231
|
+
# metrics — surface it as config so it isn't lost.
|
|
232
|
+
# Skip if unchanged since last log (avoid redundant
|
|
233
|
+
# config writes when logged every step).
|
|
234
|
+
if self._last_logged_config.get(key, _MISSING) != value:
|
|
235
|
+
pluto_config[key] = value
|
|
236
|
+
elif isinstance(value, (int, float)):
|
|
191
237
|
pluto_data[key] = value
|
|
192
|
-
elif
|
|
193
|
-
pluto_data[key] =
|
|
238
|
+
elif (num := _as_scalar_number(value)) is not None:
|
|
239
|
+
pluto_data[key] = num
|
|
240
|
+
elif isinstance(value, str):
|
|
241
|
+
if self._last_logged_config.get(key, _MISSING) != value:
|
|
242
|
+
pluto_config[key] = value
|
|
194
243
|
elif isinstance(value, (list, tuple)):
|
|
195
244
|
# List of wandb media — convert each element.
|
|
196
245
|
converted_items = []
|
|
@@ -200,22 +249,107 @@ class WandbRunWrapper:
|
|
|
200
249
|
converted_items.append(c)
|
|
201
250
|
if converted_items:
|
|
202
251
|
pluto_data[key] = converted_items
|
|
252
|
+
else:
|
|
253
|
+
# Not a media list (e.g. list of primitives) —
|
|
254
|
+
# preserve as config if possible, else warn.
|
|
255
|
+
self._handle_unforwardable(key, value, pluto_config)
|
|
203
256
|
else:
|
|
204
257
|
# Try to convert wandb data types to pluto equivalents
|
|
205
258
|
converted = _convert_wandb_to_pluto(key, value, self._pluto)
|
|
206
259
|
if converted is not None:
|
|
207
260
|
pluto_data[key] = converted
|
|
208
|
-
|
|
261
|
+
else:
|
|
262
|
+
# No metric/media mapping — last-resort handling
|
|
263
|
+
# so the value is never silently dropped.
|
|
264
|
+
self._handle_unforwardable(key, value, pluto_config)
|
|
265
|
+
|
|
266
|
+
# Metrics and config are sent in independent try blocks: a
|
|
267
|
+
# failure logging metrics must NOT skip the config update (or
|
|
268
|
+
# vice versa) — str/bool from the same wandb.log() call live in
|
|
269
|
+
# config and would otherwise be silently lost.
|
|
209
270
|
if pluto_data:
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
271
|
+
try:
|
|
272
|
+
log_kwargs = {}
|
|
273
|
+
if actual_step is not None:
|
|
274
|
+
log_kwargs['step'] = actual_step
|
|
275
|
+
self._pluto_run.log(pluto_data, **log_kwargs)
|
|
276
|
+
except Exception as e:
|
|
277
|
+
logger.debug(
|
|
278
|
+
f'pluto.compat.wandb: Failed to log metrics to Pluto: {e}'
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
if pluto_config:
|
|
282
|
+
try:
|
|
283
|
+
self._pluto_run.update_config(pluto_config)
|
|
284
|
+
# Only remember as synced once the update succeeds.
|
|
285
|
+
# deepcopy so the dedup snapshot can't share a reference
|
|
286
|
+
# with a caller-owned mutable: today pluto_config holds
|
|
287
|
+
# only immutable str/bool or a fresh to_native_config
|
|
288
|
+
# rebuild, but copying keeps the != comparison correct
|
|
289
|
+
# even if a future branch stores a user object directly.
|
|
290
|
+
self._last_logged_config.update(copy.deepcopy(pluto_config))
|
|
291
|
+
except Exception as e:
|
|
292
|
+
logger.debug(
|
|
293
|
+
f'pluto.compat.wandb: Failed to sync config to Pluto: {e}'
|
|
294
|
+
)
|
|
214
295
|
except Exception as e:
|
|
215
|
-
logger.debug(f'pluto.compat.wandb: Failed to
|
|
296
|
+
logger.debug(f'pluto.compat.wandb: Failed to prepare Pluto data: {e}')
|
|
216
297
|
|
|
217
298
|
return result
|
|
218
299
|
|
|
300
|
+
def _handle_unforwardable(self, key, value, pluto_config: Dict[str, Any]) -> None:
    """Fallback for a logged value that has no metric/media mapping.

    Two outcomes are possible:

    * ``_config_storable_value`` reports the value as storable: its native
      form is written into ``pluto_config[key]``, unless it equals the value
      last recorded for ``key`` in ``self._last_logged_config`` (in which
      case nothing is added).
    * Otherwise the value is not forwarded. The first time this happens for
      a given ``key`` a debug log line is written and a warning-level message
      naming the value's type is sent through ``pluto.sentry``; later
      occurrences for the same key return silently.

    Args:
        key: Log key the value was logged under.
        value: The value that could not be mapped to a metric or media type.
        pluto_config: Pending config update for the current log call;
            mutated in place when the value is storable and changed.
    """
    is_storable, native_value = _config_storable_value(value)
    if is_storable:
        previous = self._last_logged_config.get(key, _MISSING)
        if previous != native_value:
            pluto_config[key] = native_value
        return

    # Report each key only once per wrapper instance.
    if key in self._unforwardable_warned:
        return
    self._unforwardable_warned.add(key)

    value_type = type(value).__name__
    # Local logging stays at debug level: nothing here is user-actionable.
    logger.debug(
        'pluto.compat.wandb: not forwarding %r to Pluto — type %s has no '
        'metric/media/config mapping (still logged to W&B).',
        key,
        value_type,
    )
    # The message text contains the type name but not the key, so reports
    # for the same unhandled type share identical text.
    try:
        from pluto import sentry

        sentry.capture_message(
            f'wandb compat: unforwardable Pluto log value of type '
            f'{value_type!r} (no metric/media/config mapping)',
            level='warning',
        )
    except Exception:
        # Telemetry is best-effort; it must never break the log() call.
        pass
|
|
352
|
+
|
|
219
353
|
def finish(self, exit_code=None, quiet=None):
|
|
220
354
|
"""Finish both wandb and Pluto runs."""
|
|
221
355
|
with self._close_lock:
|
|
@@ -498,14 +632,56 @@ def _resolve_wandb_to_pluto_run(wandb_run_id, project):
|
|
|
498
632
|
return None
|
|
499
633
|
|
|
500
634
|
|
|
501
|
-
def
|
|
502
|
-
"""
|
|
635
|
+
def _as_scalar_number(value):
|
|
636
|
+
"""Return value as a python int/float if it's a scalar number, else None.
|
|
637
|
+
|
|
638
|
+
Mirrors Pluto's own log() (op._process_log_item_sync), which forwards
|
|
639
|
+
anything exposing a callable ``.item()``. The shim previously only
|
|
640
|
+
accepted plain int/float and torch scalar tensors, so a value logged as
|
|
641
|
+
a numpy scalar (``np.int64``), a 0-d numpy array, or a non-torch 0-d
|
|
642
|
+
tensor was dropped here even though Pluto core would have kept it — e.g.
|
|
643
|
+
an ``epoch`` that is ``np.int64`` rather than a plain ``int``.
|
|
644
|
+
|
|
645
|
+
bool and str are excluded (Pluto drops bool metrics; str routes to
|
|
646
|
+
config). ``.item()`` on a multi-element array/tensor raises — we treat
|
|
647
|
+
that as "not a scalar" and return None, same as Pluto would fail it.
|
|
648
|
+
"""
|
|
649
|
+
if isinstance(value, (bool, str)):
|
|
650
|
+
return None
|
|
651
|
+
item = getattr(value, 'item', None)
|
|
652
|
+
if not callable(item):
|
|
653
|
+
return None
|
|
654
|
+
try:
|
|
655
|
+
result = item()
|
|
656
|
+
except Exception:
|
|
657
|
+
return None
|
|
658
|
+
if isinstance(result, bool) or not isinstance(result, (int, float)):
|
|
659
|
+
return None
|
|
660
|
+
return result
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def _config_storable_value(value):
|
|
664
|
+
"""Return ``(storable, native)`` for the config fallback.
|
|
665
|
+
|
|
666
|
+
Mirrors what ``update_config`` actually does — normalize via
|
|
667
|
+
``to_native_config`` (which deep-converts OmegaConf ``DictConfig`` /
|
|
668
|
+
``ListConfig`` to native containers), then check JSON-serializability.
|
|
669
|
+
Keeping the gate in lockstep with ``update_config`` means a logged
|
|
670
|
+
``DictConfig`` is correctly stored as config, even though plain
|
|
671
|
+
``json.dumps`` would reject it. Tensors / ndarrays / custom objects still
|
|
672
|
+
fail (``to_native_config`` leaves them as-is) and fall through to the
|
|
673
|
+
Sentry path.
|
|
674
|
+
|
|
675
|
+
Returns ``(True, native_value)`` when storable, else ``(False, None)``.
|
|
676
|
+
"""
|
|
503
677
|
try:
|
|
504
|
-
import
|
|
678
|
+
from pluto.util import to_native_config
|
|
505
679
|
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
return
|
|
680
|
+
native = to_native_config(value)
|
|
681
|
+
json.dumps(native)
|
|
682
|
+
return True, native
|
|
683
|
+
except Exception:
|
|
684
|
+
return False, None
|
|
509
685
|
|
|
510
686
|
|
|
511
687
|
def _is_torch_distributed() -> bool:
|
|
@@ -531,6 +707,17 @@ def _is_torch_distributed() -> bool:
|
|
|
531
707
|
return False
|
|
532
708
|
|
|
533
709
|
|
|
710
|
+
def _wandb_caption(value):
|
|
711
|
+
"""Extract a user-provided caption from a wandb media object.
|
|
712
|
+
|
|
713
|
+
wandb.Image/Audio/Video store the ``caption=`` kwarg on ``_caption``.
|
|
714
|
+
Returns a non-empty string or None (ignores wandb's list-of-captions
|
|
715
|
+
grouping form, which has no single-file equivalent here).
|
|
716
|
+
"""
|
|
717
|
+
cap = getattr(value, '_caption', None)
|
|
718
|
+
return cap if isinstance(cap, str) and cap else None
|
|
719
|
+
|
|
720
|
+
|
|
534
721
|
def _convert_wandb_to_pluto(key, value, pluto_module):
|
|
535
722
|
"""
|
|
536
723
|
Convert wandb data types to Pluto equivalents.
|
|
@@ -550,15 +737,16 @@ def _convert_wandb_to_pluto(key, value, pluto_module):
|
|
|
550
737
|
# which does NOT match subclasses, so we can't pass the PIL
|
|
551
738
|
# object directly. Instead, use the file path — wandb.Image
|
|
552
739
|
# always writes to _path on construction.
|
|
740
|
+
caption = _wandb_caption(value)
|
|
553
741
|
if getattr(value, '_path', None):
|
|
554
|
-
return pluto_module.Image(value._path)
|
|
742
|
+
return pluto_module.Image(value._path, caption=caption)
|
|
555
743
|
# Fallback: convert PIL to numpy (which pluto.Image handles)
|
|
556
744
|
pil_img = getattr(value, 'image', None) or getattr(value, '_image', None)
|
|
557
745
|
if pil_img is not None:
|
|
558
746
|
try:
|
|
559
747
|
import numpy as np
|
|
560
748
|
|
|
561
|
-
return pluto_module.Image(np.asarray(pil_img))
|
|
749
|
+
return pluto_module.Image(np.asarray(pil_img), caption=caption)
|
|
562
750
|
except Exception:
|
|
563
751
|
return None
|
|
564
752
|
return None
|
|
@@ -578,14 +766,14 @@ def _convert_wandb_to_pluto(key, value, pluto_module):
|
|
|
578
766
|
# wandb.Audio always writes to _path on construction
|
|
579
767
|
# (whether from numpy, file path, or bytes).
|
|
580
768
|
if getattr(value, '_path', None):
|
|
581
|
-
return pluto_module.Audio(value._path)
|
|
769
|
+
return pluto_module.Audio(value._path, caption=_wandb_caption(value))
|
|
582
770
|
return None
|
|
583
771
|
|
|
584
772
|
if type_name == 'Video':
|
|
585
773
|
# wandb.Video always writes to _path on construction (after
|
|
586
774
|
# encoding). This can take a few seconds for numpy input.
|
|
587
775
|
if getattr(value, '_path', None):
|
|
588
|
-
return pluto_module.Video(value._path)
|
|
776
|
+
return pluto_module.Video(value._path, caption=_wandb_caption(value))
|
|
589
777
|
return None
|
|
590
778
|
|
|
591
779
|
if type_name == 'Table':
|
|
@@ -851,7 +1039,9 @@ def _make_patched_init(original_init, wandb_module):
|
|
|
851
1039
|
f'wandb will continue to work normally, but NO DATA will be '
|
|
852
1040
|
f'sent to Pluto. To fix, resolve the error above and retry.'
|
|
853
1041
|
)
|
|
854
|
-
|
|
1042
|
+
# exc_info=True attaches the traceback so the log points at the
|
|
1043
|
+
# raise site (e.g. the failing json.dumps), not just this handler.
|
|
1044
|
+
logger.error(_msg, exc_info=True)
|
|
855
1045
|
# Also print to stderr so it shows up even if logging is not configured
|
|
856
1046
|
import sys
|
|
857
1047
|
|
|
@@ -23,6 +23,10 @@ INVALID_CHAR = re.compile(r'[^a-zA-Z0-9_\-.]')
|
|
|
23
23
|
|
|
24
24
|
class File:
|
|
25
25
|
tag = tag
|
|
26
|
+
# Optional user-provided caption (e.g. Image(caption=...)). Media subclasses
|
|
27
|
+
# override this instance attribute in their __init__; the class-level default
|
|
28
|
+
# ensures it always exists (e.g. on a directly-constructed File).
|
|
29
|
+
_caption: Optional[str] = None
|
|
26
30
|
|
|
27
31
|
def __init__(
|
|
28
32
|
self,
|
|
@@ -112,6 +116,10 @@ class Artifact(File):
|
|
|
112
116
|
self._tmp: Optional[str] = None
|
|
113
117
|
self._ext: str = ''
|
|
114
118
|
self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
|
|
119
|
+
# Preserve the raw caption separately so it can be sent to the server
|
|
120
|
+
# as a dedicated field (mlop_files.caption); _name keeps the legacy
|
|
121
|
+
# caption-as-filename behavior for back-compat with older servers.
|
|
122
|
+
self._caption = caption
|
|
115
123
|
self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
|
|
116
124
|
|
|
117
125
|
self._metadata: Dict[str, Any] = metadata or {}
|
|
@@ -148,6 +156,10 @@ class Text(File):
|
|
|
148
156
|
|
|
149
157
|
def __init__(self, data: Union[str, Any], caption: Optional[str] = None) -> None:
|
|
150
158
|
self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
|
|
159
|
+
# Preserve the raw caption separately so it can be sent to the server
|
|
160
|
+
# as a dedicated field (mlop_files.caption); _name keeps the legacy
|
|
161
|
+
# caption-as-filename behavior for back-compat with older servers.
|
|
162
|
+
self._caption = caption
|
|
151
163
|
self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
|
|
152
164
|
self._ext = '.txt'
|
|
153
165
|
self._path: Optional[str] = None
|
|
@@ -182,6 +194,10 @@ class Image(File):
|
|
|
182
194
|
caption: Optional[str] = None,
|
|
183
195
|
) -> None:
|
|
184
196
|
self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
|
|
197
|
+
# Preserve the raw caption separately so it can be sent to the server
|
|
198
|
+
# as a dedicated field (mlop_files.caption); _name keeps the legacy
|
|
199
|
+
# caption-as-filename behavior for back-compat with older servers.
|
|
200
|
+
self._caption = caption
|
|
185
201
|
self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
|
|
186
202
|
self._ext = '.png'
|
|
187
203
|
self._image: Any = None
|
|
@@ -252,6 +268,10 @@ class Audio(File):
|
|
|
252
268
|
rate = kwargs.get('sample_rate', rate)
|
|
253
269
|
|
|
254
270
|
self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
|
|
271
|
+
# Preserve the raw caption separately so it can be sent to the server
|
|
272
|
+
# as a dedicated field (mlop_files.caption); _name keeps the legacy
|
|
273
|
+
# caption-as-filename behavior for back-compat with older servers.
|
|
274
|
+
self._caption = caption
|
|
255
275
|
self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
|
|
256
276
|
self._ext = '.wav'
|
|
257
277
|
self._audio: Any
|
|
@@ -305,6 +325,10 @@ class Video(File):
|
|
|
305
325
|
rate = kwargs.get('fps', rate)
|
|
306
326
|
|
|
307
327
|
self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
|
|
328
|
+
# Preserve the raw caption separately so it can be sent to the server
|
|
329
|
+
# as a dedicated field (mlop_files.caption); _name keeps the legacy
|
|
330
|
+
# caption-as-filename behavior for back-compat with older servers.
|
|
331
|
+
self._caption = caption
|
|
308
332
|
self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
|
|
309
333
|
self._ext = f'.{format}' if format in ['mp4', 'webm', 'ogg', 'gif'] else '.mp4'
|
|
310
334
|
self._path: Optional[str] = None
|
|
@@ -187,12 +187,16 @@ class ServerInterface:
|
|
|
187
187
|
num: List of numeric metric names
|
|
188
188
|
df: Dict mapping file type names to lists of log names
|
|
189
189
|
"""
|
|
190
|
+
# Suppress the per-request httpx INFO line ("HTTP Request: POST
|
|
191
|
+
# .../api/runs/logName/add ..."). One POST fires per new metric/file
|
|
192
|
+
# name, so this is noisy; the heartbeat/status path suppresses it too.
|
|
190
193
|
if num:
|
|
191
194
|
self._post_v1(
|
|
192
195
|
self.settings.url_meta,
|
|
193
196
|
self.headers,
|
|
194
197
|
make_compat_meta_v1(num, 'num', self.settings),
|
|
195
198
|
client=self.client_api,
|
|
199
|
+
suppress_httpx_logs=True,
|
|
196
200
|
)
|
|
197
201
|
if df:
|
|
198
202
|
for type_name, names in df.items():
|
|
@@ -201,6 +205,7 @@ class ServerInterface:
|
|
|
201
205
|
self.headers,
|
|
202
206
|
make_compat_meta_v1(names, type_name, self.settings),
|
|
203
207
|
client=self.client_api,
|
|
208
|
+
suppress_httpx_logs=True,
|
|
204
209
|
)
|
|
205
210
|
|
|
206
211
|
def _log_failed_request(
|
|
@@ -5,9 +5,10 @@ from typing import Any, Dict, Optional, Union
|
|
|
5
5
|
import pluto
|
|
6
6
|
|
|
7
7
|
from . import sentry as _sentry
|
|
8
|
+
from ._fs import get_fs_type, is_network_fs
|
|
8
9
|
from .op import Op
|
|
9
10
|
from .sets import Settings, _classify_run_id, _is_display_id, setup
|
|
10
|
-
from .util import deep_merge, gen_id, get_char
|
|
11
|
+
from .util import deep_merge, gen_id, get_char, to_native_config
|
|
11
12
|
|
|
12
13
|
logger = logging.getLogger(f'{__name__.split(".")[0]}')
|
|
13
14
|
tag = 'Init'
|
|
@@ -58,6 +59,36 @@ class OpInit:
|
|
|
58
59
|
self.settings = settings
|
|
59
60
|
|
|
60
61
|
|
|
62
|
+
def _warn_if_network_staging_dir(settings: Settings) -> None:
    """Emit a warning when the sync DB directory is on a network filesystem.

    The directory checked is the parent of ``settings.sync_process_db_path``
    when that override is set, otherwise ``settings.get_dir()``. If
    ``is_network_fs`` flags it, a warning naming the directory and its
    filesystem type is logged; any exception raised during the check is
    logged at debug level and swallowed, so detection can never break
    ``init()``.

    Args:
        settings: Run settings providing ``sync_process_db_path`` and
            ``get_dir()``.
    """
    explicit_db = settings.sync_process_db_path
    if explicit_db:
        staging_dir = os.path.dirname(explicit_db)
    else:
        staging_dir = settings.get_dir()

    try:
        if is_network_fs(staging_dir):
            # Fall back to a generic label if the type lookup comes back empty.
            detected_type = get_fs_type(staging_dir) or 'network'
            logger.warning(
                '%s: pluto staging directory %r is on a network filesystem (%s). '
                'WAL-mode SQLite locking is unreliable there and can cause '
                '"locking protocol" retries that slow down logging. Point it at '
                'node-local storage via pluto.init(dir=...) or the PLUTO_DIR '
                'environment variable (e.g. /tmp).',
                tag,
                staging_dir,
                detected_type,
            )
    except Exception as e:
        # Detection must never break init().
        logger.debug('%s: network-fs check skipped: %s', tag, e)
|
|
90
|
+
|
|
91
|
+
|
|
61
92
|
def init(
|
|
62
93
|
dir: Optional[str] = None,
|
|
63
94
|
project: Optional[str] = None,
|
|
@@ -165,6 +196,10 @@ def init(
|
|
|
165
196
|
) # datetime.now().strftime("%Y%m%d"), str(int(time.time()))
|
|
166
197
|
# settings._op_id = id if id else gen_id(seed=settings.project)
|
|
167
198
|
|
|
199
|
+
# Warn (once) if the sync DB will live on a network filesystem. Done after
|
|
200
|
+
# project/_op_name are set so get_dir() resolves the real run directory.
|
|
201
|
+
_warn_if_network_staging_dir(settings)
|
|
202
|
+
|
|
168
203
|
# Classify run_id: display ID → resume, numeric → resume, other → externalId
|
|
169
204
|
# Parameter takes precedence over environment variable (already handled in setup())
|
|
170
205
|
if run_id is not None:
|
|
@@ -194,6 +229,12 @@ def init(
|
|
|
194
229
|
if inherit_tags is not None:
|
|
195
230
|
settings._inherit_tags = inherit_tags
|
|
196
231
|
|
|
232
|
+
# Normalize the config to JSON-native types up front (e.g. OmegaConf
|
|
233
|
+
# DictConfig -> dict, resolving interpolations). Done before the fork
|
|
234
|
+
# deep-merge below so its `isinstance(config, dict)` check works, and so
|
|
235
|
+
# everything downstream (storage, serialization) sees clean native data.
|
|
236
|
+
config = to_native_config(config)
|
|
237
|
+
|
|
197
238
|
# Deep-merge inherited parent config with user config (client-side).
|
|
198
239
|
# The server only does a shallow merge, so we fetch the parent config,
|
|
199
240
|
# deep-merge locally, and disable server-side inheritance.
|
|
@@ -240,7 +281,7 @@ def init(
|
|
|
240
281
|
return op
|
|
241
282
|
except Exception as e:
|
|
242
283
|
_sentry.capture_exception(e)
|
|
243
|
-
logger.critical('%s: failed, %s', tag, e) # add early logger
|
|
284
|
+
logger.critical('%s: failed, %s', tag, e, exc_info=True) # add early logger
|
|
244
285
|
raise e
|
|
245
286
|
|
|
246
287
|
|