PyPI - pluto-ml - Versions diffs - 0.0.22__tar.gz → 0.0.24__tar.gz - Mend

pluto-ml 0.0.22__tar.gz → 0.0.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)

{pluto_ml-0.0.22 → pluto_ml-0.0.24}/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pluto-ml
-Version: 0.0.22
+Version: 0.0.24
 Summary: Pluto ML - Machine Learning Operations Framework
 License-File: LICENSE
 Author: jqssun

{pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/__init__.py RENAMED

@@ -41,11 +41,11 @@ __all__ = (
     'generate_run_id',
 )
-__version__ = '0.0.22'
+__version__ = '0.0.24'
 # Replaced with the current commit when building the wheels.
-_PLUTO_COMMIT_SHA = '4587211b1c6ccebe92f92d1243bf9a213ec1f3dd'
+_PLUTO_COMMIT_SHA = '83cf832b478eb34d6458bd346b3c29cb6fa71a80'
 def _get_git_commit():

pluto_ml-0.0.24/pluto/_fs.py ADDED

@@ -0,0 +1,96 @@
+"""Filesystem detection helpers.
+Pluto stages local state — most importantly the WAL-mode SQLite database used
+to hand data from the training process to the sync process — under the run
+directory. SQLite's WAL locking relies on POSIX byte-range locks plus a shared
+-shm mmap, neither of which behaves reliably on network filesystems (NFS,
+Lustre, SMB/CIFS, ...). On those mounts the lock handoff degrades into
+SQLITE_PROTOCOL ("locking protocol") races that show up as repeated lock
+retries and can badly throttle logging.
+These helpers let `init()` detect that situation up front and tell the user to
+point the staging dir (``pluto.init(dir=...)`` / ``PLUTO_DIR``) at node-local
+storage. Detection is best-effort and Linux-only — on any other platform, or if
+the mount table can't be read, we return ``None``/``False`` and stay silent
+rather than risk a false alarm.
+"""
+import os
+import re
+from typing import Optional
+# /proc/self/mountinfo escapes space, tab, newline and backslash in paths and
+# fstypes as octal sequences (\040, \011, \012, \134). Decode them so prefix
+# matching works for mount points that contain such characters.
+_OCTAL_ESCAPE = re.compile(r'\\([0-7]{3})')
+def _unescape_mountinfo(field: str) -> str:
+    return _OCTAL_ESCAPE.sub(lambda m: chr(int(m.group(1), 8)), field)
+# Filesystem type prefixes (as reported in /proc/self/mountinfo) whose locking
+# semantics are unreliable for WAL-mode SQLite. Matched as prefixes so that
+# e.g. both "nfs" and "nfs4", or "fuse.sshfs", are covered.
+_NETWORK_FS_PREFIXES = (
+    'nfs',
+    'cifs',
+    'smb',
+    'lustre',
+    'gpfs',
+    'ceph',
+    'glusterfs',
+    'afs',
+    'ncpfs',
+    'fuse.sshfs',
+    'fuse.glusterfs',
+    'beegfs',
+)
+def get_fs_type(path: str) -> Optional[str]:
+    """Return the filesystem type backing ``path``, or ``None`` if unknown.
+    Linux-only: reads ``/proc/self/mountinfo`` and returns the type of the
+    most specific (longest) mount point that is a prefix of ``path``. The path
+    need not exist yet — matching is done on the resolved path string, so a
+    not-yet-created run directory still resolves to its parent mount. Returns
+    ``None`` on non-Linux platforms or if the mount table can't be parsed.
+    """
+    try:
+        target = os.path.realpath(path)
+        best_mount = ''
+        best_type: Optional[str] = None
+        with open('/proc/self/mountinfo', 'r') as f:
+            for line in f:
+                # Format: "<id> <pid> <maj:min> <root> <mountpoint> <opts> \
+                #          [optional fields] - <fstype> <source> <superopts>"
+                left, sep, right = line.partition(' - ')
+                if not sep:
+                    continue
+                left_fields = left.split()
+                right_fields = right.split()
+                if len(left_fields) < 5 or not right_fields:
+                    continue
+                mount_point = _unescape_mountinfo(left_fields[4])
+                fstype = _unescape_mountinfo(right_fields[0])
+                # Longest mount point that is a path-prefix of target wins.
+                if target == mount_point or target.startswith(
+                    mount_point.rstrip('/') + '/'
+                ):
+                    if len(mount_point) >= len(best_mount):
+                        best_mount = mount_point
+                        best_type = fstype
+        return best_type
+    except (OSError, ValueError, IndexError):
+        return None
+def is_network_fs(path: str) -> bool:
+    """True if ``path`` appears to live on a network filesystem.
+    Best-effort and conservative: returns ``False`` when the filesystem type
+    can't be determined (e.g. non-Linux), so callers never warn spuriously.
+    """
+    fstype = get_fs_type(path)
+    return fstype is not None and fstype.lower().startswith(_NETWORK_FS_PREFIXES)

{pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/api.py RENAMED

@@ -4,7 +4,7 @@ import re
 import signal
 from datetime import datetime
-from .util import clean_dict, find_node
+from .util import clean_dict, config_json_default, find_node
 logger = logging.getLogger(f'{__name__.split(".")[0]}')
 tag = 'API'
@@ -40,7 +40,9 @@ def make_compat_start_v1(config, settings, info, tags=None):
         'runName': settings._op_name,
         'projectName': settings.project,
         'externalId': settings._external_id,  # User-provided run ID for multi-node
-        'config': json.dumps(config) if config is not None else None,
+        'config': json.dumps(config, default=config_json_default)
+        if config is not None
+        else None,
         'loggerSettings': json.dumps(clean_dict(settings.to_dict())),
         'systemMetadata': json.dumps(info) if info is not None else None,
         'tags': tags if tags else None,
@@ -97,7 +99,9 @@ def make_compat_update_config_v1(settings, config):
     return json.dumps(
         {
             'runId': settings._op_id,
-            'config': json.dumps(config) if config else None,
+            'config': json.dumps(config, default=config_json_default)
+            if config
+            else None,
         }
     ).encode()
@@ -166,6 +170,9 @@ def make_compat_file_v1(file, timestamp, step):
                 'logName': k,
                 'step': step,
             }
+            caption = getattr(f, '_caption', None)
+            if caption is not None:
+                i['caption'] = caption
             batch.append(i)
     return json.dumps({'files': batch}).encode()

{pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/compat/wandb.py RENAMED

@@ -41,6 +41,8 @@ Hard Requirements:
 """
 import atexit
+import copy
+import json
 import logging
 import os
 import threading
@@ -55,6 +57,9 @@ from ._utils import (
 logger = logging.getLogger(__name__)
+# Distinct from None so config dedup can tell "never logged" from "logged None".
+_MISSING = object()
 _original_wandb_init = None
 _original_wandb_log = None
 _original_wandb_finish = None
@@ -91,6 +96,13 @@ class WandbRunWrapper:
         self._fallback_step = 0  # Used when wandb is disabled (_step won't increment)
         self._closed = False
         self._close_lock = threading.Lock()
+        # Keys we've already warned about being unforwardable to Pluto, so a
+        # value logged every step warns once rather than spamming the logs.
+        self._unforwardable_warned: set = set()
+        # Last config values we synced to Pluto, keyed by log key. Lets us skip
+        # redundant update_config() calls when a str/bool/config value is logged
+        # unchanged every step (a common pattern: phase/status/checkpoint paths).
+        self._last_logged_config: Dict[str, Any] = {}
         if self._pluto_run:
             atexit.register(self._atexit_cleanup_pluto)
@@ -158,7 +170,26 @@ class WandbRunWrapper:
             logger.debug(f'pluto.compat.wandb: Pluto finish timed out after {timeout}s')
     def log(self, data: Dict[str, Any], step=None, commit=None, **kwargs):
-        """Log metrics to both wandb and Pluto."""
+        """Log metrics to both wandb and Pluto.
+        Value routing for the Pluto side:
+        - int/float and any scalar exposing .item() (numpy/torch/etc.)
+          -> Pluto metrics (time-series), matching Pluto core's own log()
+        - wandb media (Image/Video/Audio/Histogram/Table), and lists
+          thereof -> converted Pluto media
+        - str and bool -> Pluto config (latest-wins). Pluto has no
+          string/bool time-series metric, so these mirror wandb's
+          summary/overview placement and stay queryable via
+          get_run().config.
+        - anything else with no metric/media mapping -> preserved as
+          config if it survives update_config's normalization (incl.
+          OmegaConf), otherwise dropped and reported to Sentry telemetry
+          once per key (a maintainer-coverage signal, not a user-facing
+          warning). See _handle_unforwardable.
+        str/bool/config values are deduped against the last synced value, so
+        logging an unchanged value every step doesn't spam update_config.
+        """
         # Determine the step to use for Pluto.
         # When step is explicit, use it. Otherwise:
         # - Normal mode: read wandb's _step before log() increments it
@@ -186,11 +217,29 @@ class WandbRunWrapper:
                 # Pluto.log() natively supports lists, so we just need
                 # to convert each element and pass the list through.
                 pluto_data: Dict[str, Any] = {}
+                # String values have no time-series metric equivalent in
+                # Pluto (op._process_log_item_sync only keeps int/float/
+                # tensor/File/Data). wandb puts loose strings in the run
+                # summary/overview; the closest Pluto analogue is config,
+                # which is latest-wins and queryable via get_run().config.
+                # This is what lets e.g. a resume skill read back the most
+                # recent checkpoint/r2_path for a run.
+                pluto_config: Dict[str, Any] = {}
                 for key, value in data.items():
-                    if isinstance(value, (int, float)):
+                    if isinstance(value, bool):
+                        # bool is a subclass of int, but Pluto drops bool
+                        # metrics — surface it as config so it isn't lost.
+                        # Skip if unchanged since last log (avoid redundant
+                        # config writes when logged every step).
+                        if self._last_logged_config.get(key, _MISSING) != value:
+                            pluto_config[key] = value
+                    elif isinstance(value, (int, float)):
                         pluto_data[key] = value
-                    elif _is_torch_tensor_scalar(value):
-                        pluto_data[key] = value.item()
+                    elif (num := _as_scalar_number(value)) is not None:
+                        pluto_data[key] = num
+                    elif isinstance(value, str):
+                        if self._last_logged_config.get(key, _MISSING) != value:
+                            pluto_config[key] = value
                     elif isinstance(value, (list, tuple)):
                         # List of wandb media — convert each element.
                         converted_items = []
@@ -200,22 +249,107 @@ class WandbRunWrapper:
                                 converted_items.append(c)
                         if converted_items:
                             pluto_data[key] = converted_items
+                        else:
+                            # Not a media list (e.g. list of primitives) —
+                            # preserve as config if possible, else warn.
+                            self._handle_unforwardable(key, value, pluto_config)
                     else:
                         # Try to convert wandb data types to pluto equivalents
                         converted = _convert_wandb_to_pluto(key, value, self._pluto)
                         if converted is not None:
                             pluto_data[key] = converted
+                        else:
+                            # No metric/media mapping — last-resort handling
+                            # so the value is never silently dropped.
+                            self._handle_unforwardable(key, value, pluto_config)
+                # Metrics and config are sent in independent try blocks: a
+                # failure logging metrics must NOT skip the config update (or
+                # vice versa) — str/bool from the same wandb.log() call live in
+                # config and would otherwise be silently lost.
                 if pluto_data:
-                    log_kwargs = {}
-                    if actual_step is not None:
-                        log_kwargs['step'] = actual_step
-                    self._pluto_run.log(pluto_data, **log_kwargs)
+                    try:
+                        log_kwargs = {}
+                        if actual_step is not None:
+                            log_kwargs['step'] = actual_step
+                        self._pluto_run.log(pluto_data, **log_kwargs)
+                    except Exception as e:
+                        logger.debug(
+                            f'pluto.compat.wandb: Failed to log metrics to Pluto: {e}'
+                        )
+                if pluto_config:
+                    try:
+                        self._pluto_run.update_config(pluto_config)
+                        # Only remember as synced once the update succeeds.
+                        # deepcopy so the dedup snapshot can't share a reference
+                        # with a caller-owned mutable: today pluto_config holds
+                        # only immutable str/bool or a fresh to_native_config
+                        # rebuild, but copying keeps the != comparison correct
+                        # even if a future branch stores a user object directly.
+                        self._last_logged_config.update(copy.deepcopy(pluto_config))
+                    except Exception as e:
+                        logger.debug(
+                            f'pluto.compat.wandb: Failed to sync config to Pluto: {e}'
+                        )
             except Exception as e:
-                logger.debug(f'pluto.compat.wandb: Failed to log metrics to Pluto: {e}')
+                logger.debug(f'pluto.compat.wandb: Failed to prepare Pluto data: {e}')
         return result
+    def _handle_unforwardable(self, key, value, pluto_config: Dict[str, Any]) -> None:
+        """Last-resort handling for a value with no metric/media mapping.
+        Pluto only stores numbers (metrics), media/structured data, and
+        config — so values outside those (dicts, None, raw/multi-element
+        tensors, numpy arrays, unconvertible wandb media like Html/Object3D,
+        custom objects) have nowhere to go. Rather than dropping them
+        silently — which is what made missing data so hard to diagnose —
+        we:
+        1. Preserve the value as config if it survives update_config's own
+           normalization (mirrors how wandb keeps loose values in the run
+           summary). This covers nested dicts/lists of primitives, None, and
+           OmegaConf DictConfig/ListConfig nodes (which to_native_config
+           deep-converts). Skipped if unchanged since the last log.
+        2. Otherwise drop the Pluto copy (it still reached W&B) and report
+           it as a maintainer-coverage signal via Sentry telemetry — once
+           per key. This is a gap in OUR type handling, not a user error,
+           so we deliberately do NOT emit a user-facing warning: people
+           migrating away from wandb shouldn't be nagged about types only
+           we can fix. The local log stays at debug for self-host
+           debugging.
+        """
+        storable, native = _config_storable_value(value)
+        if storable:
+            if self._last_logged_config.get(key, _MISSING) != native:
+                pluto_config[key] = native
+            return
+        if key in self._unforwardable_warned:
+            return
+        self._unforwardable_warned.add(key)
+        type_name = type(value).__name__
+        # Quiet locally (debug only) — not a user-actionable problem.
+        logger.debug(
+            'pluto.compat.wandb: not forwarding %r to Pluto — type %s has no '
+            'metric/media/config mapping (still logged to W&B).',
+            key,
+            type_name,
+        )
+        # Alert us (the maintainers) so we can add coverage for the type.
+        # Message is keyed on the type (not the run-specific key) so Sentry
+        # groups all occurrences of the same unhandled type together.
+        try:
+            from pluto import sentry
+            sentry.capture_message(
+                f'wandb compat: unforwardable Pluto log value of type '
+                f'{type_name!r} (no metric/media/config mapping)',
+                level='warning',
+            )
+        except Exception:
+            pass
     def finish(self, exit_code=None, quiet=None):
         """Finish both wandb and Pluto runs."""
         with self._close_lock:
@@ -498,14 +632,56 @@ def _resolve_wandb_to_pluto_run(wandb_run_id, project):
     return None
-def _is_torch_tensor_scalar(value):
-    """Check if value is a scalar torch tensor."""
+def _as_scalar_number(value):
+    """Return value as a python int/float if it's a scalar number, else None.
+    Mirrors Pluto's own log() (op._process_log_item_sync), which forwards
+    anything exposing a callable ``.item()``. The shim previously only
+    accepted plain int/float and torch scalar tensors, so a value logged as
+    a numpy scalar (``np.int64``), a 0-d numpy array, or a non-torch 0-d
+    tensor was dropped here even though Pluto core would have kept it — e.g.
+    an ``epoch`` that is ``np.int64`` rather than a plain ``int``.
+    bool and str are excluded (Pluto drops bool metrics; str routes to
+    config). ``.item()`` on a multi-element array/tensor raises — we treat
+    that as "not a scalar" and return None, same as Pluto would fail it.
+    """
+    if isinstance(value, (bool, str)):
+        return None
+    item = getattr(value, 'item', None)
+    if not callable(item):
+        return None
+    try:
+        result = item()
+    except Exception:
+        return None
+    if isinstance(result, bool) or not isinstance(result, (int, float)):
+        return None
+    return result
+def _config_storable_value(value):
+    """Return ``(storable, native)`` for the config fallback.
+    Mirrors what ``update_config`` actually does — normalize via
+    ``to_native_config`` (which deep-converts OmegaConf ``DictConfig`` /
+    ``ListConfig`` to native containers), then check JSON-serializability.
+    Keeping the gate in lockstep with ``update_config`` means a logged
+    ``DictConfig`` is correctly stored as config, even though plain
+    ``json.dumps`` would reject it. Tensors / ndarrays / custom objects still
+    fail (``to_native_config`` leaves them as-is) and fall through to the
+    Sentry path.
+    Returns ``(True, native_value)`` when storable, else ``(False, None)``.
+    """
     try:
-        import torch
+        from pluto.util import to_native_config
-        return isinstance(value, torch.Tensor) and value.dim() == 0
-    except ImportError:
-        return False
+        native = to_native_config(value)
+        json.dumps(native)
+        return True, native
+    except Exception:
+        return False, None
 def _is_torch_distributed() -> bool:
@@ -531,6 +707,17 @@ def _is_torch_distributed() -> bool:
         return False
+def _wandb_caption(value):
+    """Extract a user-provided caption from a wandb media object.
+    wandb.Image/Audio/Video store the ``caption=`` kwarg on ``_caption``.
+    Returns a non-empty string or None (ignores wandb's list-of-captions
+    grouping form, which has no single-file equivalent here).
+    """
+    cap = getattr(value, '_caption', None)
+    return cap if isinstance(cap, str) and cap else None
 def _convert_wandb_to_pluto(key, value, pluto_module):
     """
     Convert wandb data types to Pluto equivalents.
@@ -550,15 +737,16 @@ def _convert_wandb_to_pluto(key, value, pluto_module):
             # which does NOT match subclasses, so we can't pass the PIL
             # object directly. Instead, use the file path — wandb.Image
             # always writes to _path on construction.
+            caption = _wandb_caption(value)
             if getattr(value, '_path', None):
-                return pluto_module.Image(value._path)
+                return pluto_module.Image(value._path, caption=caption)
             # Fallback: convert PIL to numpy (which pluto.Image handles)
             pil_img = getattr(value, 'image', None) or getattr(value, '_image', None)
             if pil_img is not None:
                 try:
                     import numpy as np
-                    return pluto_module.Image(np.asarray(pil_img))
+                    return pluto_module.Image(np.asarray(pil_img), caption=caption)
                 except Exception:
                     return None
             return None
@@ -578,14 +766,14 @@ def _convert_wandb_to_pluto(key, value, pluto_module):
             # wandb.Audio always writes to _path on construction
             # (whether from numpy, file path, or bytes).
             if getattr(value, '_path', None):
-                return pluto_module.Audio(value._path)
+                return pluto_module.Audio(value._path, caption=_wandb_caption(value))
             return None
         if type_name == 'Video':
             # wandb.Video always writes to _path on construction (after
             # encoding). This can take a few seconds for numpy input.
             if getattr(value, '_path', None):
-                return pluto_module.Video(value._path)
+                return pluto_module.Video(value._path, caption=_wandb_caption(value))
             return None
         if type_name == 'Table':
@@ -851,7 +1039,9 @@ def _make_patched_init(original_init, wandb_module):
                 f'wandb will continue to work normally, but NO DATA will be '
                 f'sent to Pluto. To fix, resolve the error above and retry.'
             )
-            logger.error(_msg)
+            # exc_info=True attaches the traceback so the log points at the
+            # raise site (e.g. the failing json.dumps), not just this handler.
+            logger.error(_msg, exc_info=True)
             # Also print to stderr so it shows up even if logging is not configured
             import sys

{pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/file.py RENAMED

@@ -23,6 +23,10 @@ INVALID_CHAR = re.compile(r'[^a-zA-Z0-9_\-.]')
 class File:
     tag = tag
+    # Optional user-provided caption (e.g. Image(caption=...)). Media subclasses
+    # override this instance attribute in their __init__; the class-level default
+    # ensures it always exists (e.g. on a directly-constructed File).
+    _caption: Optional[str] = None
     def __init__(
         self,
@@ -112,6 +116,10 @@ class Artifact(File):
         self._tmp: Optional[str] = None
         self._ext: str = ''
         self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
+        # Preserve the raw caption separately so it can be sent to the server
+        # as a dedicated field (mlop_files.caption); _name keeps the legacy
+        # caption-as-filename behavior for back-compat with older servers.
+        self._caption = caption
         self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
         self._metadata: Dict[str, Any] = metadata or {}
@@ -148,6 +156,10 @@ class Text(File):
     def __init__(self, data: Union[str, Any], caption: Optional[str] = None) -> None:
         self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
+        # Preserve the raw caption separately so it can be sent to the server
+        # as a dedicated field (mlop_files.caption); _name keeps the legacy
+        # caption-as-filename behavior for back-compat with older servers.
+        self._caption = caption
         self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
         self._ext = '.txt'
         self._path: Optional[str] = None
@@ -182,6 +194,10 @@ class Image(File):
         caption: Optional[str] = None,
     ) -> None:
         self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
+        # Preserve the raw caption separately so it can be sent to the server
+        # as a dedicated field (mlop_files.caption); _name keeps the legacy
+        # caption-as-filename behavior for back-compat with older servers.
+        self._caption = caption
         self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
         self._ext = '.png'
         self._image: Any = None
@@ -252,6 +268,10 @@ class Audio(File):
         rate = kwargs.get('sample_rate', rate)
         self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
+        # Preserve the raw caption separately so it can be sent to the server
+        # as a dedicated field (mlop_files.caption); _name keeps the legacy
+        # caption-as-filename behavior for back-compat with older servers.
+        self._caption = caption
         self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
         self._ext = '.wav'
         self._audio: Any
@@ -305,6 +325,10 @@ class Video(File):
         rate = kwargs.get('fps', rate)
         self._name = caption + f'.{uuid.uuid4()}' if caption else f'{uuid.uuid4()}'
+        # Preserve the raw caption separately so it can be sent to the server
+        # as a dedicated field (mlop_files.caption); _name keeps the legacy
+        # caption-as-filename behavior for back-compat with older servers.
+        self._caption = caption
         self._id = f'{uuid.uuid4()}{uuid.uuid4()}'.replace('-', '')
         self._ext = f'.{format}' if format in ['mp4', 'webm', 'ogg', 'gif'] else '.mp4'
         self._path: Optional[str] = None

{pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/iface.py RENAMED

@@ -187,12 +187,16 @@ class ServerInterface:
             num: List of numeric metric names
             df: Dict mapping file type names to lists of log names
         """
+        # Suppress the per-request httpx INFO line ("HTTP Request: POST
+        # .../api/runs/logName/add ..."). One POST fires per new metric/file
+        # name, so this is noisy; the heartbeat/status path suppresses it too.
         if num:
             self._post_v1(
                 self.settings.url_meta,
                 self.headers,
                 make_compat_meta_v1(num, 'num', self.settings),
                 client=self.client_api,
+                suppress_httpx_logs=True,
             )
         if df:
             for type_name, names in df.items():
@@ -201,6 +205,7 @@ class ServerInterface:
                     self.headers,
                     make_compat_meta_v1(names, type_name, self.settings),
                     client=self.client_api,
+                    suppress_httpx_logs=True,
                 )
     def _log_failed_request(

{pluto_ml-0.0.22 → pluto_ml-0.0.24}/pluto/init.py RENAMED

@@ -5,9 +5,10 @@ from typing import Any, Dict, Optional, Union
 import pluto
 from . import sentry as _sentry
+from ._fs import get_fs_type, is_network_fs
 from .op import Op
 from .sets import Settings, _classify_run_id, _is_display_id, setup
-from .util import deep_merge, gen_id, get_char
+from .util import deep_merge, gen_id, get_char, to_native_config
 logger = logging.getLogger(f'{__name__.split(".")[0]}')
 tag = 'Init'
@@ -58,6 +59,36 @@ class OpInit:
         self.settings = settings
+def _warn_if_network_staging_dir(settings: Settings) -> None:
+    """Warn once if the sync DB will live on a network filesystem.
+    WAL-mode SQLite locking is unreliable on NFS/Lustre/SMB and degrades into
+    "locking protocol" retries that throttle logging. Detection is best-effort
+    and Linux-only; on other platforms this is a no-op (see pluto/_fs.py).
+    """
+    # The sync DB lives under the run dir (settings.get_dir()) unless an
+    # explicit path override is set.
+    db_path = settings.sync_process_db_path
+    staging_dir = os.path.dirname(db_path) if db_path else settings.get_dir()
+    try:
+        if not is_network_fs(staging_dir):
+            return
+        fstype = get_fs_type(staging_dir) or 'network'
+        logger.warning(
+            '%s: pluto staging directory %r is on a network filesystem (%s). '
+            'WAL-mode SQLite locking is unreliable there and can cause '
+            '"locking protocol" retries that slow down logging. Point it at '
+            'node-local storage via pluto.init(dir=...) or the PLUTO_DIR '
+            'environment variable (e.g. /tmp).',
+            tag,
+            staging_dir,
+            fstype,
+        )
+    except Exception as e:
+        # Detection must never break init().
+        logger.debug('%s: network-fs check skipped: %s', tag, e)
 def init(
     dir: Optional[str] = None,
     project: Optional[str] = None,
@@ -165,6 +196,10 @@ def init(
     )  # datetime.now().strftime("%Y%m%d"), str(int(time.time()))
     # settings._op_id = id if id else gen_id(seed=settings.project)
+    # Warn (once) if the sync DB will live on a network filesystem. Done after
+    # project/_op_name are set so get_dir() resolves the real run directory.
+    _warn_if_network_staging_dir(settings)
     # Classify run_id: display ID → resume, numeric → resume, other → externalId
     # Parameter takes precedence over environment variable (already handled in setup())
     if run_id is not None:
@@ -194,6 +229,12 @@ def init(
     if inherit_tags is not None:
         settings._inherit_tags = inherit_tags
+    # Normalize the config to JSON-native types up front (e.g. OmegaConf
+    # DictConfig -> dict, resolving interpolations). Done before the fork
+    # deep-merge below so its `isinstance(config, dict)` check works, and so
+    # everything downstream (storage, serialization) sees clean native data.
+    config = to_native_config(config)
     # Deep-merge inherited parent config with user config (client-side).
     # The server only does a shallow merge, so we fetch the parent config,
     # deep-merge locally, and disable server-side inheritance.
@@ -240,7 +281,7 @@ def init(
         return op
     except Exception as e:
         _sentry.capture_exception(e)
-        logger.critical('%s: failed, %s', tag, e)  # add early logger
+        logger.critical('%s: failed, %s', tag, e, exc_info=True)  # add early logger
         raise e

pluto-ml 0.0.22__tar.gz → 0.0.24__tar.gz

pluto-ml 0.0.22__tar.gz → 0.0.24__tar.gz