pluto-ml 0.0.22__tar.gz → 0.0.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/PKG-INFO +1 -1
  2. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/__init__.py +2 -2
  3. pluto_ml-0.0.24/pluto/_fs.py +96 -0
  4. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/api.py +10 -3
  5. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/wandb.py +211 -21
  6. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/file.py +24 -0
  7. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/iface.py +5 -0
  8. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/init.py +43 -2
  9. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/op.py +70 -3
  10. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sets.py +11 -0
  11. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/process.py +16 -10
  12. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/store.py +92 -16
  13. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/util.py +78 -0
  14. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pyproject.toml +3 -1
  15. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/LICENSE +0 -0
  16. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/README.md +0 -0
  17. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/__init__.py +0 -0
  18. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/__main__.py +0 -0
  19. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/__init__.py +0 -0
  20. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/lightning.py +0 -0
  21. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/neptune.py +0 -0
  22. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/torch.py +0 -0
  23. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/mlop/compat/transformers.py +0 -0
  24. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/__main__.py +0 -0
  25. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/_wandb_hook.py +0 -0
  26. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/auth.py +0 -0
  27. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/__init__.py +0 -0
  28. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/_utils.py +0 -0
  29. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/lightning.py +0 -0
  30. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/neptune.py +0 -0
  31. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/neptune_query/__init__.py +0 -0
  32. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/neptune_query/filters.py +0 -0
  33. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/neptune_query/runs.py +0 -0
  34. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/torch.py +0 -0
  35. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/transformers.py +0 -0
  36. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/data.py +0 -0
  37. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/log.py +0 -0
  38. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/query.py +0 -0
  39. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sanitize.py +0 -0
  40. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sentry.py +0 -0
  41. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/store.py +0 -0
  42. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/__init__.py +0 -0
  43. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/__main__.py +0 -0
  44. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sync/retry.py +0 -0
  45. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/sys.py +0 -0
  46. {pluto_ml-0.0.22 → pluto_ml-0.0.24}/zzzz_pluto_wandb_hook.pth +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pluto-ml
3
- Version: 0.0.22
3
+ Version: 0.0.24
4
4
  Summary: Pluto ML - Machine Learning Operations Framework
5
5
  License-File: LICENSE
6
6
  Author: jqssun
@@ -41,11 +41,11 @@ __all__ = (
41
41
  'generate_run_id',
42
42
  )
43
43
 
44
- __version__ = '0.0.22'
44
+ __version__ = '0.0.24'
45
45
 
46
46
 
47
47
  # Replaced with the current commit when building the wheels.
48
- _PLUTO_COMMIT_SHA = '4587211b1c6ccebe92f92d1243bf9a213ec1f3dd'
48
+ _PLUTO_COMMIT_SHA = '83cf832b478eb34d6458bd346b3c29cb6fa71a80'
49
49
 
50
50
 
51
51
  def _get_git_commit():
@@ -0,0 +1,96 @@
1
+ """Filesystem detection helpers.
2
+
3
+ Pluto stages local state — most importantly the WAL-mode SQLite database used
4
+ to hand data from the training process to the sync process — under the run
5
+ directory. SQLite's WAL locking relies on POSIX byte-range locks plus a shared
6
+ -shm mmap, neither of which behaves reliably on network filesystems (NFS,
7
+ Lustre, SMB/CIFS, ...). On those mounts the lock handoff degrades into
8
+ SQLITE_PROTOCOL ("locking protocol") races that show up as repeated lock
9
+ retries and can badly throttle logging.
10
+
11
+ These helpers let `init()` detect that situation up front and tell the user to
12
+ point the staging dir (``pluto.init(dir=...)`` / ``PLUTO_DIR``) at node-local
13
+ storage. Detection is best-effort and Linux-only — on any other platform, or if
14
+ the mount table can't be read, we return ``None``/``False`` and stay silent
15
+ rather than risk a false alarm.
16
+ """
17
+
18
+ import os
19
+ import re
20
+ from typing import Optional
21
+
22
# /proc/self/mountinfo escapes space, tab, newline and backslash in paths and
# fstypes as octal sequences (\040, \011, \012, \134). Decode them so prefix
# matching works for mount points that contain such characters.
_OCTAL_ESCAPE = re.compile(r'\\([0-7]{3})')

# Default location of the per-process mount table. Callers may pass another
# file to get_fs_type()/is_network_fs(), e.g. a fixture with the same format.
_MOUNTINFO_PATH = '/proc/self/mountinfo'


def _unescape_mountinfo(field: str) -> str:
    """Replace every three-digit octal escape in ``field`` with its character."""
    return _OCTAL_ESCAPE.sub(lambda m: chr(int(m.group(1), 8)), field)


# Filesystem type prefixes (as reported in /proc/self/mountinfo) whose locking
# semantics are unreliable for WAL-mode SQLite. Matched as prefixes so that
# e.g. both "nfs" and "nfs4", or "fuse.sshfs", are covered.
_NETWORK_FS_PREFIXES = (
    'nfs',
    'cifs',
    'smb',
    'lustre',
    'gpfs',
    'ceph',
    'glusterfs',
    'afs',
    'ncpfs',
    'fuse.sshfs',
    'fuse.glusterfs',
    'beegfs',
)


def get_fs_type(path: str, mountinfo_path: str = _MOUNTINFO_PATH) -> Optional[str]:
    """Return the filesystem type backing ``path``, or ``None`` if unknown.

    Reads a mount table in ``/proc/self/mountinfo`` format and returns the
    type of the most specific (longest) mount point that is a path-prefix of
    the resolved ``path``. Matching is done on the path string, so ``path``
    does not have to exist yet. When two entries have the same mount point,
    the one listed later wins.

    Args:
        path: File or directory whose backing filesystem is wanted.
        mountinfo_path: Mount table to read. Defaults to the Linux
            per-process table.

    Returns:
        The filesystem type string (e.g. ``'ext4'``, ``'nfs4'``), or ``None``
        when the mount table can't be opened or parsed, or no entry matches.
    """
    try:
        target = os.path.realpath(path)
        best_mount = ''
        best_type: Optional[str] = None
        # Read bytes and decode with os.fsdecode so mount points are decoded
        # exactly like the path string returned by realpath. A strict text
        # decoder would raise on the first entry containing undecodable
        # bytes and disable detection for every path.
        with open(mountinfo_path, 'rb') as f:
            for raw_line in f:
                line = os.fsdecode(raw_line)
                # Format: "<id> <pid> <maj:min> <root> <mountpoint> <opts> \
                #          [optional fields] - <fstype> <source> <superopts>"
                left, sep, right = line.partition(' - ')
                if not sep:
                    continue
                left_fields = left.split()
                right_fields = right.split()
                if len(left_fields) < 5 or not right_fields:
                    continue
                mount_point = _unescape_mountinfo(left_fields[4])
                fstype = _unescape_mountinfo(right_fields[0])
                # Compare against "<mount>/" so "/data" does not match
                # "/database"; rstrip keeps the root mount "/" working.
                if target == mount_point or target.startswith(
                    mount_point.rstrip('/') + '/'
                ):
                    # ">=" lets a later entry for the same mount point
                    # replace an earlier one.
                    if len(mount_point) >= len(best_mount):
                        best_mount = mount_point
                        best_type = fstype
        return best_type
    except (OSError, ValueError, IndexError):
        return None


def is_network_fs(path: str, mountinfo_path: str = _MOUNTINFO_PATH) -> bool:
    """True if ``path`` appears to live on a network filesystem.

    Best-effort and conservative: returns ``False`` when the filesystem type
    can't be determined, so callers never warn spuriously.

    Args:
        path: File or directory to classify.
        mountinfo_path: Mount table to read; forwarded to ``get_fs_type``.
    """
    fstype = get_fs_type(path, mountinfo_path)
    return fstype is not None and fstype.lower().startswith(_NETWORK_FS_PREFIXES)
@@ -4,7 +4,7 @@ import re
4
4
  import signal
5
5
  from datetime import datetime
6
6
 
7
- from .util import clean_dict, find_node
7
+ from .util import clean_dict, config_json_default, find_node
8
8
 
9
9
  logger = logging.getLogger(f'{__name__.split(".")[0]}')
10
10
  tag = 'API'
@@ -40,7 +40,9 @@ def make_compat_start_v1(config, settings, info, tags=None):
40
40
  'runName': settings._op_name,
41
41
  'projectName': settings.project,
42
42
  'externalId': settings._external_id, # User-provided run ID for multi-node
43
- 'config': json.dumps(config) if config is not None else None,
43
+ 'config': json.dumps(config, default=config_json_default)
44
+ if config is not None
45
+ else None,
44
46
  'loggerSettings': json.dumps(clean_dict(settings.to_dict())),
45
47
  'systemMetadata': json.dumps(info) if info is not None else None,
46
48
  'tags': tags if tags else None,
@@ -97,7 +99,9 @@ def make_compat_update_config_v1(settings, config):
97
99
  return json.dumps(
98
100
  {
99
101
  'runId': settings._op_id,
100
- 'config': json.dumps(config) if config else None,
102
+ 'config': json.dumps(config, default=config_json_default)
103
+ if config
104
+ else None,
101
105
  }
102
106
  ).encode()
103
107
 
@@ -166,6 +170,9 @@ def make_compat_file_v1(file, timestamp, step):
166
170
  'logName': k,
167
171
  'step': step,
168
172
  }
173
+ caption = getattr(f, '_caption', None)
174
+ if caption is not None:
175
+ i['caption'] = caption
169
176
  batch.append(i)
170
177
  return json.dumps({'files': batch}).encode()
171
178
 
@@ -41,6 +41,8 @@ Hard Requirements:
41
41
  """
42
42
 
43
43
  import atexit
44
+ import copy
45
+ import json
44
46
  import logging
45
47
  import os
46
48
  import threading
@@ -55,6 +57,9 @@ from ._utils import (
55
57
 
56
58
  logger = logging.getLogger(__name__)
57
59
 
60
+ # Distinct from None so config dedup can tell "never logged" from "logged None".
61
+ _MISSING = object()
62
+
58
63
  _original_wandb_init = None
59
64
  _original_wandb_log = None
60
65
  _original_wandb_finish = None
@@ -91,6 +96,13 @@ class WandbRunWrapper:
91
96
  self._fallback_step = 0 # Used when wandb is disabled (_step won't increment)
92
97
  self._closed = False
93
98
  self._close_lock = threading.Lock()
99
+ # Keys we've already warned about being unforwardable to Pluto, so a
100
+ # value logged every step warns once rather than spamming the logs.
101
+ self._unforwardable_warned: set = set()
102
+ # Last config values we synced to Pluto, keyed by log key. Lets us skip
103
+ # redundant update_config() calls when a str/bool/config value is logged
104
+ # unchanged every step (a common pattern: phase/status/checkpoint paths).
105
+ self._last_logged_config: Dict[str, Any] = {}
94
106
 
95
107
  if self._pluto_run:
96
108
  atexit.register(self._atexit_cleanup_pluto)
@@ -158,7 +170,26 @@ class WandbRunWrapper:
158
170
  logger.debug(f'pluto.compat.wandb: Pluto finish timed out after {timeout}s')
159
171
 
160
172
  def log(self, data: Dict[str, Any], step=None, commit=None, **kwargs):
161
- """Log metrics to both wandb and Pluto."""
173
+ """Log metrics to both wandb and Pluto.
174
+
175
+ Value routing for the Pluto side:
176
+ - int/float and any scalar exposing .item() (numpy/torch/etc.)
177
+ -> Pluto metrics (time-series), matching Pluto core's own log()
178
+ - wandb media (Image/Video/Audio/Histogram/Table), and lists
179
+ thereof -> converted Pluto media
180
+ - str and bool -> Pluto config (latest-wins). Pluto has no
181
+ string/bool time-series metric, so these mirror wandb's
182
+ summary/overview placement and stay queryable via
183
+ get_run().config.
184
+ - anything else with no metric/media mapping -> preserved as
185
+ config if it survives update_config's normalization (incl.
186
+ OmegaConf), otherwise dropped and reported to Sentry telemetry
187
+ once per key (a maintainer-coverage signal, not a user-facing
188
+ warning). See _handle_unforwardable.
189
+
190
+ str/bool/config values are deduped against the last synced value, so
191
+ logging an unchanged value every step doesn't spam update_config.
192
+ """
162
193
  # Determine the step to use for Pluto.
163
194
  # When step is explicit, use it. Otherwise:
164
195
  # - Normal mode: read wandb's _step before log() increments it
@@ -186,11 +217,29 @@ class WandbRunWrapper:
186
217
  # Pluto.log() natively supports lists, so we just need
187
218
  # to convert each element and pass the list through.
188
219
  pluto_data: Dict[str, Any] = {}
220
+ # String values have no time-series metric equivalent in
221
+ # Pluto (op._process_log_item_sync only keeps int/float/
222
+ # tensor/File/Data). wandb puts loose strings in the run
223
+ # summary/overview; the closest Pluto analogue is config,
224
+ # which is latest-wins and queryable via get_run().config.
225
+ # This is what lets e.g. a resume skill read back the most
226
+ # recent checkpoint/r2_path for a run.
227
+ pluto_config: Dict[str, Any] = {}
189
228
  for key, value in data.items():
190
- if isinstance(value, (int, float)):
229
+ if isinstance(value, bool):
230
+ # bool is a subclass of int, but Pluto drops bool
231
+ # metrics — surface it as config so it isn't lost.
232
+ # Skip if unchanged since last log (avoid redundant
233
+ # config writes when logged every step).
234
+ if self._last_logged_config.get(key, _MISSING) != value:
235
+ pluto_config[key] = value
236
+ elif isinstance(value, (int, float)):
191
237
  pluto_data[key] = value
192
- elif _is_torch_tensor_scalar(value):
193
- pluto_data[key] = value.item()
238
+ elif (num := _as_scalar_number(value)) is not None:
239
+ pluto_data[key] = num
240
+ elif isinstance(value, str):
241
+ if self._last_logged_config.get(key, _MISSING) != value:
242
+ pluto_config[key] = value
194
243
  elif isinstance(value, (list, tuple)):
195
244
  # List of wandb media — convert each element.
196
245
  converted_items = []
@@ -200,22 +249,107 @@ class WandbRunWrapper:
200
249
  converted_items.append(c)
201
250
  if converted_items:
202
251
  pluto_data[key] = converted_items
252
+ else:
253
+ # Not a media list (e.g. list of primitives) —
254
+ # preserve as config if possible, else warn.
255
+ self._handle_unforwardable(key, value, pluto_config)
203
256
  else:
204
257
  # Try to convert wandb data types to pluto equivalents
205
258
  converted = _convert_wandb_to_pluto(key, value, self._pluto)
206
259
  if converted is not None:
207
260
  pluto_data[key] = converted
208
-
261
+ else:
262
+ # No metric/media mapping — last-resort handling
263
+ # so the value is never silently dropped.
264
+ self._handle_unforwardable(key, value, pluto_config)
265
+
266
+ # Metrics and config are sent in independent try blocks: a
267
+ # failure logging metrics must NOT skip the config update (or
268
+ # vice versa) — str/bool from the same wandb.log() call live in
269
+ # config and would otherwise be silently lost.
209
270
  if pluto_data:
210
- log_kwargs = {}
211
- if actual_step is not None:
212
- log_kwargs['step'] = actual_step
213
- self._pluto_run.log(pluto_data, **log_kwargs)
271
+ try:
272
+ log_kwargs = {}
273
+ if actual_step is not None:
274
+ log_kwargs['step'] = actual_step
275
+ self._pluto_run.log(pluto_data, **log_kwargs)
276
+ except Exception as e:
277
+ logger.debug(
278
+ f'pluto.compat.wandb: Failed to log metrics to Pluto: {e}'
279
+ )
280
+
281
+ if pluto_config:
282
+ try:
283
+ self._pluto_run.update_config(pluto_config)
284
+ # Only remember as synced once the update succeeds.
285
+ # deepcopy so the dedup snapshot can't share a reference
286
+ # with a caller-owned mutable: today pluto_config holds
287
+ # only immutable str/bool or a fresh to_native_config
288
+ # rebuild, but copying keeps the != comparison correct
289
+ # even if a future branch stores a user object directly.
290
+ self._last_logged_config.update(copy.deepcopy(pluto_config))
291
+ except Exception as e:
292
+ logger.debug(
293
+ f'pluto.compat.wandb: Failed to sync config to Pluto: {e}'
294
+ )
214
295
  except Exception as e:
215
- logger.debug(f'pluto.compat.wandb: Failed to log metrics to Pluto: {e}')
296
+ logger.debug(f'pluto.compat.wandb: Failed to prepare Pluto data: {e}')
216
297
 
217
298
  return result
218
299
 
300
def _handle_unforwardable(self, key, value, pluto_config: Dict[str, Any]) -> None:
    """Last-resort handling for a value with no metric/media mapping.

    Two outcomes, tried in order:

    1. If ``_config_storable_value`` reports the value as storable, its
       normalized form is queued in ``pluto_config`` under ``key``, unless
       it equals the value last synced for that key, in which case nothing
       is queued.
    2. Otherwise the Pluto copy is dropped. The first time this happens
       for a given key, a debug line is logged and a warning-level message
       is sent through ``pluto.sentry``; later occurrences for the same
       key are silent. The Sentry message names the value's type rather
       than the key, so occurrences of the same type group together.

    Args:
        key: Log key the value was logged under.
        value: The value that had no metric/media mapping.
        pluto_config: Dict of pending config updates; mutated in place.
    """
    storable, native = _config_storable_value(value)
    if storable:
        # _MISSING distinguishes "never synced" from "synced None".
        if self._last_logged_config.get(key, _MISSING) != native:
            pluto_config[key] = native
        return
    if key in self._unforwardable_warned:
        return
    self._unforwardable_warned.add(key)
    type_name = type(value).__name__
    # Quiet locally (debug only) — not a user-actionable problem.
    logger.debug(
        'pluto.compat.wandb: not forwarding %r to Pluto — type %s has no '
        'metric/media/config mapping (still logged to W&B).',
        key,
        type_name,
    )
    # Alert the maintainers so coverage for the type can be added.
    try:
        from pluto import sentry

        sentry.capture_message(
            f'wandb compat: unforwardable Pluto log value of type '
            f'{type_name!r} (no metric/media/config mapping)',
            level='warning',
        )
    except Exception as e:
        # Telemetry stays best-effort, but leave a trace instead of
        # swallowing the failure without any record.
        logger.debug(
            'pluto.compat.wandb: failed to report unforwardable type %s: %s',
            type_name,
            e,
        )
352
+
219
353
  def finish(self, exit_code=None, quiet=None):
220
354
  """Finish both wandb and Pluto runs."""
221
355
  with self._close_lock:
@@ -498,14 +632,56 @@ def _resolve_wandb_to_pluto_run(wandb_run_id, project):
498
632
  return None
499
633
 
500
634
 
501
- def _is_torch_tensor_scalar(value):
502
- """Check if value is a scalar torch tensor."""
635
def _as_scalar_number(value):
    """Extract a python int/float from a scalar-like object, else return None.

    A value qualifies when it has a callable ``item`` attribute whose call
    succeeds and yields an ``int`` or ``float`` that is not a ``bool``.
    Everything else gives ``None``: ``bool`` and ``str`` inputs, objects
    without a callable ``item``, an ``item()`` call that raises, and an
    ``item()`` result of any other type.
    """
    if isinstance(value, (bool, str)):
        return None

    extractor = getattr(value, 'item', None)
    if callable(extractor):
        try:
            scalar = extractor()
        except Exception:
            # A failing item() is treated as "not a scalar".
            return None
        # bool is an int subclass, so it has to be ruled out explicitly.
        if isinstance(scalar, (int, float)) and not isinstance(scalar, bool):
            return scalar
    return None
661
+
662
+
663
+ def _config_storable_value(value):
664
+ """Return ``(storable, native)`` for the config fallback.
665
+
666
+ Mirrors what ``update_config`` actually does — normalize via
667
+ ``to_native_config`` (which deep-converts OmegaConf ``DictConfig`` /
668
+ ``ListConfig`` to native containers), then check JSON-serializability.
669
+ Keeping the gate in lockstep with ``update_config`` means a logged
670
+ ``DictConfig`` is correctly stored as config, even though plain
671
+ ``json.dumps`` would reject it. Tensors / ndarrays / custom objects still
672
+ fail (``to_native_config`` leaves them as-is) and fall through to the
673
+ Sentry path.
674
+
675
+ Returns ``(True, native_value)`` when storable, else ``(False, None)``.
676
+ """
503
677
  try:
504
- import torch
678
+ from pluto.util import to_native_config
505
679
 
506
- return isinstance(value, torch.Tensor) and value.dim() == 0
507
- except ImportError:
508
- return False
680
+ native = to_native_config(value)
681
+ json.dumps(native)
682
+ return True, native
683
+ except Exception:
684
+ return False, None
509
685
 
510
686
 
511
687
  def _is_torch_distributed() -> bool:
@@ -531,6 +707,17 @@ def _is_torch_distributed() -> bool:
531
707
  return False
532
708
 
533
709
 
710
def _wandb_caption(value):
    """Return the caption stored on ``value._caption`` if it is usable.

    Only a non-empty ``str`` is returned. A missing attribute, an empty
    string, or any non-string value (such as a list of captions) gives
    ``None``.
    """
    caption = getattr(value, '_caption', None)
    if isinstance(caption, str) and caption:
        return caption
    return None
719
+
720
+
534
721
  def _convert_wandb_to_pluto(key, value, pluto_module):
535
722
  """
536
723
  Convert wandb data types to Pluto equivalents.
@@ -550,15 +737,16 @@ def _convert_wandb_to_pluto(key, value, pluto_module):
550
737
  # which does NOT match subclasses, so we can't pass the PIL
551
738
  # object directly. Instead, use the file path — wandb.Image
552
739
  # always writes to _path on construction.
740
+ caption = _wandb_caption(value)
553
741
  if getattr(value, '_path', None):
554
- return pluto_module.Image(value._path)
742
+ return pluto_module.Image(value._path, caption=caption)
555
743
  # Fallback: convert PIL to numpy (which pluto.Image handles)
556
744
  pil_img = getattr(value, 'image', None) or getattr(value, '_image', None)
557
745
  if pil_img is not None:
558
746
  try:
559
747
  import numpy as np
560
748
 
561
- return pluto_module.Image(np.asarray(pil_img))
749
+ return pluto_module.Image(np.asarray(pil_img), caption=caption)
562
750
  except Exception:
563
751
  return None
564
752
  return None
@@ -578,14 +766,14 @@ def _convert_wandb_to_pluto(key, value, pluto_module):
578
766
  # wandb.Audio always writes to _path on construction
579
767
  # (whether from numpy, file path, or bytes).
580
768
  if getattr(value, '_path', None):
581
- return pluto_module.Audio(value._path)
769
+ return pluto_module.Audio(value._path, caption=_wandb_caption(value))
582
770
  return None
583
771
 
584
772
  if type_name == 'Video':
585
773
  # wandb.Video always writes to _path on construction (after
586
774
  # encoding). This can take a few seconds for numpy input.
587
775
  if getattr(value, '_path', None):
588
- return pluto_module.Video(value._path)
776
+ return pluto_module.Video(value._path, caption=_wandb_caption(value))
589
777
  return None
590
778
 
591
779
  if type_name == 'Table':
@@ -851,7 +1039,9 @@ def _make_patched_init(original_init, wandb_module):
851
1039
  f'wandb will continue to work normally, but NO DATA will be '
852
1040
  f'sent to Pluto. To fix, resolve the error above and retry.'
853
1041
  )
854
- logger.error(_msg)
1042
+ # exc_info=True attaches the traceback so the log points at the
1043
+ # raise site (e.g. the failing json.dumps), not just this handler.
1044
+ logger.error(_msg, exc_info=True)
855
1045
  # Also print to stderr so it shows up even if logging is not configured
856
1046
  import sys
857
1047
 
@@ -23,6 +23,10 @@ INVALID_CHAR = re.compile(r'[^a-zA-Z0-9_\-.]')
23
23
 
24
24
  class File:
25
25
  tag = tag
26
+ # Optional user-provided caption (e.g. Image(caption=...)). Media subclasses
27
+ # override this instance attribute in their __init__; the class-level default
28
+ # ensures it always exists (e.g. on a directly-constructed File).
29
+ _caption: Optional[str] = None
26
30
 
27
31
  def __init__(
28
32
  self,
@@ -112,6 +116,10 @@ class Artifact(File):
112
116
  self._tmp: Optional[str] = None
113
117
  self._ext: str = ''
114
118
  self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
119
+ # Preserve the raw caption separately so it can be sent to the server
120
+ # as a dedicated field (mlop_files.caption); _name keeps the legacy
121
+ # caption-as-filename behavior for back-compat with older servers.
122
+ self._caption = caption
115
123
  self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
116
124
 
117
125
  self._metadata: Dict[str, Any] = metadata or {}
@@ -148,6 +156,10 @@ class Text(File):
148
156
 
149
157
  def __init__(self, data: Union[str, Any], caption: Optional[str] = None) -> None:
150
158
  self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
159
+ # Preserve the raw caption separately so it can be sent to the server
160
+ # as a dedicated field (mlop_files.caption); _name keeps the legacy
161
+ # caption-as-filename behavior for back-compat with older servers.
162
+ self._caption = caption
151
163
  self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
152
164
  self._ext = '.txt'
153
165
  self._path: Optional[str] = None
@@ -182,6 +194,10 @@ class Image(File):
182
194
  caption: Optional[str] = None,
183
195
  ) -> None:
184
196
  self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
197
+ # Preserve the raw caption separately so it can be sent to the server
198
+ # as a dedicated field (mlop_files.caption); _name keeps the legacy
199
+ # caption-as-filename behavior for back-compat with older servers.
200
+ self._caption = caption
185
201
  self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
186
202
  self._ext = '.png'
187
203
  self._image: Any = None
@@ -252,6 +268,10 @@ class Audio(File):
252
268
  rate = kwargs.get('sample_rate', rate)
253
269
 
254
270
  self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
271
+ # Preserve the raw caption separately so it can be sent to the server
272
+ # as a dedicated field (mlop_files.caption); _name keeps the legacy
273
+ # caption-as-filename behavior for back-compat with older servers.
274
+ self._caption = caption
255
275
  self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
256
276
  self._ext = '.wav'
257
277
  self._audio: Any
@@ -305,6 +325,10 @@ class Video(File):
305
325
  rate = kwargs.get('fps', rate)
306
326
 
307
327
  self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
328
+ # Preserve the raw caption separately so it can be sent to the server
329
+ # as a dedicated field (mlop_files.caption); _name keeps the legacy
330
+ # caption-as-filename behavior for back-compat with older servers.
331
+ self._caption = caption
308
332
  self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
309
333
  self._ext = f'.{format}' if format in ['mp4', 'webm', 'ogg', 'gif'] else '.mp4'
310
334
  self._path: Optional[str] = None
@@ -187,12 +187,16 @@ class ServerInterface:
187
187
  num: List of numeric metric names
188
188
  df: Dict mapping file type names to lists of log names
189
189
  """
190
+ # Suppress the per-request httpx INFO line ("HTTP Request: POST
191
+ # .../api/runs/logName/add ..."). One POST fires per new metric/file
192
+ # name, so this is noisy; the heartbeat/status path suppresses it too.
190
193
  if num:
191
194
  self._post_v1(
192
195
  self.settings.url_meta,
193
196
  self.headers,
194
197
  make_compat_meta_v1(num, 'num', self.settings),
195
198
  client=self.client_api,
199
+ suppress_httpx_logs=True,
196
200
  )
197
201
  if df:
198
202
  for type_name, names in df.items():
@@ -201,6 +205,7 @@ class ServerInterface:
201
205
  self.headers,
202
206
  make_compat_meta_v1(names, type_name, self.settings),
203
207
  client=self.client_api,
208
+ suppress_httpx_logs=True,
204
209
  )
205
210
 
206
211
  def _log_failed_request(
@@ -5,9 +5,10 @@ from typing import Any, Dict, Optional, Union
5
5
  import pluto
6
6
 
7
7
  from . import sentry as _sentry
8
+ from ._fs import get_fs_type, is_network_fs
8
9
  from .op import Op
9
10
  from .sets import Settings, _classify_run_id, _is_display_id, setup
10
- from .util import deep_merge, gen_id, get_char
11
+ from .util import deep_merge, gen_id, get_char, to_native_config
11
12
 
12
13
  logger = logging.getLogger(f'{__name__.split(".")[0]}')
13
14
  tag = 'Init'
@@ -58,6 +59,36 @@ class OpInit:
58
59
  self.settings = settings
59
60
 
60
61
 
62
def _warn_if_network_staging_dir(settings: Settings) -> None:
    """Log a warning if the sync DB directory is on a network filesystem.

    The directory checked is the parent of ``settings.sync_process_db_path``
    when that is set, otherwise ``settings.get_dir()``. If ``is_network_fs``
    reports it as a network mount, a warning naming the directory and the
    filesystem type is logged.

    Any exception raised while resolving the directory or probing the mount
    table is logged at debug level and suppressed, so this check cannot make
    ``init()`` fail.

    Args:
        settings: Run settings providing the DB path override and run dir.
    """
    try:
        # Resolved inside the guard: reading the settings is part of the
        # best-effort check and must not propagate errors either.
        db_path = settings.sync_process_db_path
        staging_dir = os.path.dirname(db_path) if db_path else settings.get_dir()
        if not is_network_fs(staging_dir):
            return
        # Fall back to a generic label if the type can't be read this time.
        fstype = get_fs_type(staging_dir) or 'network'
        logger.warning(
            '%s: pluto staging directory %r is on a network filesystem (%s). '
            'WAL-mode SQLite locking is unreliable there and can cause '
            '"locking protocol" retries that slow down logging. Point it at '
            'node-local storage via pluto.init(dir=...) or the PLUTO_DIR '
            'environment variable (e.g. /tmp).',
            tag,
            staging_dir,
            fstype,
        )
    except Exception as e:
        # Detection must never break init().
        logger.debug('%s: network-fs check skipped: %s', tag, e)
90
+
91
+
61
92
  def init(
62
93
  dir: Optional[str] = None,
63
94
  project: Optional[str] = None,
@@ -165,6 +196,10 @@ def init(
165
196
  ) # datetime.now().strftime("%Y%m%d"), str(int(time.time()))
166
197
  # settings._op_id = id if id else gen_id(seed=settings.project)
167
198
 
199
+ # Warn (once) if the sync DB will live on a network filesystem. Done after
200
+ # project/_op_name are set so get_dir() resolves the real run directory.
201
+ _warn_if_network_staging_dir(settings)
202
+
168
203
  # Classify run_id: display ID → resume, numeric → resume, other → externalId
169
204
  # Parameter takes precedence over environment variable (already handled in setup())
170
205
  if run_id is not None:
@@ -194,6 +229,12 @@ def init(
194
229
  if inherit_tags is not None:
195
230
  settings._inherit_tags = inherit_tags
196
231
 
232
+ # Normalize the config to JSON-native types up front (e.g. OmegaConf
233
+ # DictConfig -> dict, resolving interpolations). Done before the fork
234
+ # deep-merge below so its `isinstance(config, dict)` check works, and so
235
+ # everything downstream (storage, serialization) sees clean native data.
236
+ config = to_native_config(config)
237
+
197
238
  # Deep-merge inherited parent config with user config (client-side).
198
239
  # The server only does a shallow merge, so we fetch the parent config,
199
240
  # deep-merge locally, and disable server-side inheritance.
@@ -240,7 +281,7 @@ def init(
240
281
  return op
241
282
  except Exception as e:
242
283
  _sentry.capture_exception(e)
243
- logger.critical('%s: failed, %s', tag, e) # add early logger
284
+ logger.critical('%s: failed, %s', tag, e, exc_info=True) # add early logger
244
285
  raise e
245
286
 
246
287