marin-rigging 0.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- marin_rigging-0.99.dist-info/METADATA +7 -0
- marin_rigging-0.99.dist-info/RECORD +10 -0
- marin_rigging-0.99.dist-info/WHEEL +4 -0
- rigging/__init__.py +2 -0
- rigging/config_discovery.py +154 -0
- rigging/distributed_lock.py +444 -0
- rigging/filesystem.py +1046 -0
- rigging/log_setup.py +253 -0
- rigging/redaction.py +166 -0
- rigging/timing.py +627 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
rigging/__init__.py,sha256=YCa8ajV8oMVGzdM-0ADm_Y7gL7bP-E4DDjDz4V7pBA0,68
|
|
2
|
+
rigging/config_discovery.py,sha256=YvEa6aH8ZcZnnvj4PLJWWmlNTj7i1hkNVWAR1YbFcik,5377
|
|
3
|
+
rigging/distributed_lock.py,sha256=kp3AOQNMFpVq_nqrUjeWzrBcc5kqKHLIv2RTVPhRvm8,16538
|
|
4
|
+
rigging/filesystem.py,sha256=ygwwa3LqLggUhV2DqX8_mEp_46nusJIUNZtQvoJJyTY,38951
|
|
5
|
+
rigging/log_setup.py,sha256=JQgnoGxlqi8P0shcIm5QP-XcfnYk1lhh88OtpkEU0VI,8616
|
|
6
|
+
rigging/redaction.py,sha256=y5NExKXWkErkFDd5qMbgV6NMqDVVmgqFwgpJ7GQDYG0,6378
|
|
7
|
+
rigging/timing.py,sha256=XlNlQudxTalHmMR6eUfahzK2kEolzRyzEj7V0aJH92Y,20923
|
|
8
|
+
marin_rigging-0.99.dist-info/METADATA,sha256=AXUVh3jOv2_hzr2n40WWnWBXGItZoHloqUcdu75GUCs,179
|
|
9
|
+
marin_rigging-0.99.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
10
|
+
marin_rigging-0.99.dist-info/RECORD,,
|
rigging/__init__.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# Copyright The Marin Authors
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Path-agnostic config discovery for cluster YAML files.
|
|
5
|
+
|
|
6
|
+
Generic YAML config discovery helpers. Callers (e.g. iris) pass the
|
|
7
|
+
directories to search; this module knows nothing about any particular
|
|
8
|
+
marin sub-package.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import functools
|
|
12
|
+
import logging
|
|
13
|
+
from collections.abc import Sequence
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import tomllib
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# File extensions (as returned by ``Path.suffix``) that mark a file as a YAML
# config; used both when scanning directories and when stripping an extension
# from a user-supplied cluster name.
_YAML_SUFFIXES = (".yaml", ".yml")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@functools.lru_cache(maxsize=128)
def _find_project_root_from(current: Path) -> Path | None:
    """Walk up from the already-resolved ``current`` looking for the workspace root."""
    for directory in (current, *current.parents):
        pp = directory / "pyproject.toml"
        if pp.is_file() and _declares_uv_workspace(pp):
            logger.debug("Found marin workspace root: %s", directory)
            return directory

    logger.debug("No marin workspace root found starting from %s", current)
    return None


def find_project_root(start: Path | str | None = None) -> Path | None:
    """Find the marin workspace root.

    Walks up from ``start`` (or the current working directory) looking for a
    ``pyproject.toml`` that declares ``[tool.uv.workspace]``. This uniquely
    identifies the top-level marin root and avoids matching a workspace
    member's pyproject (e.g. ``lib/iris/pyproject.toml``).

    The starting point is resolved *before* the cache lookup, so results are
    memoized per absolute directory. Caching on the raw argument would pin the
    answer for ``start=None`` to whatever the working directory was on the
    first call, even after the process changes directory.

    Args:
        start: Directory to start from. ``None`` means the current working
            directory at call time.

    Returns:
        The marin root ``Path``, or ``None`` when running outside a marin
        checkout (e.g. from an installed pip package).
    """
    origin = Path.cwd() if start is None else Path(start)
    return _find_project_root_from(origin.resolve())


# Keep the lru_cache management hooks reachable through the public name so
# existing callers of ``find_project_root.cache_clear()`` keep working.
find_project_root.cache_clear = _find_project_root_from.cache_clear
find_project_root.cache_info = _find_project_root_from.cache_info
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _declares_uv_workspace(pyproject_path: Path) -> bool:
|
|
49
|
+
"""Return True if ``pyproject_path`` declares ``[tool.uv.workspace]``."""
|
|
50
|
+
try:
|
|
51
|
+
with pyproject_path.open("rb") as f:
|
|
52
|
+
data = tomllib.load(f)
|
|
53
|
+
except (OSError, tomllib.TOMLDecodeError):
|
|
54
|
+
return False
|
|
55
|
+
return "workspace" in data.get("tool", {}).get("uv", {})
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _resolve_dirs(dirs: Sequence[Path | str]) -> list[Path]:
    """Turn caller-supplied search dirs into concrete paths, preserving order.

    Each entry has ``~`` expanded. Entries that are then absolute are kept
    as-is. Relative entries (including the empty string, which denotes the
    base itself) are joined onto the marin project root when one is found,
    and onto the current working directory otherwise.
    """
    project_root = find_project_root()

    def _anchor(entry: Path | str) -> Path:
        expanded = Path(entry).expanduser()
        if expanded.is_absolute():
            return expanded
        # The working directory is only consulted when there is no project root.
        base = Path.cwd() if project_root is None else project_root
        return base / expanded

    return [_anchor(entry) for entry in dirs]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def find_configs(
    dirs: Sequence[Path | str],
    name: str | None = None,
) -> dict[str, Path]:
    """Discover YAML config files across ``dirs``.

    Relative ``dirs`` are resolved against the marin project root (see
    :func:`find_project_root`); absolute paths are used as-is; ``~`` is
    expanded. An empty string resolves to the project root itself.
    Directories that do not exist are skipped silently.

    Only regular files (or symlinks to them) are returned. A sub-directory or
    dangling symlink that happens to be named ``something.yaml`` is ignored,
    so it cannot shadow a real config with the same stem.

    Args:
        dirs: Directories to search, in priority order.
        name: When given, only return entries whose stem equals ``name``.

    Returns:
        A dict mapping config stem (filename without ``.yaml``/``.yml``) to
        its path. When the same stem appears more than once, the first match
        wins: earlier directories beat later ones, and within a directory
        entries are visited in sorted order.
    """
    configs: dict[str, Path] = {}
    for directory in _resolve_dirs(dirs):
        if not directory.is_dir():
            continue
        for path in sorted(directory.iterdir()):
            if path.suffix not in _YAML_SUFFIXES:
                continue
            stem = path.stem
            if name is not None and stem != name:
                continue
            if stem in configs:
                continue
            # Checked last so only real candidates are stat'ed.
            if not path.is_file():
                continue
            configs[stem] = path
    return configs
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def resolve_cluster_config(name: str, dirs: Sequence[Path | str]) -> Path:
    """Map a cluster name, or a direct file path, to a YAML config file.

    ``name`` is first tried as a path (with ``~`` expanded); if it points at
    an existing file, that path is returned without searching ``dirs``.
    Otherwise a trailing ``.yaml``/``.yml`` is dropped from ``name`` and
    ``dirs`` are searched for a config with that stem.

    Args:
        name: Cluster name (e.g. ``"marin-dev"``) or path to an existing file.
        dirs: Directories to search.

    Returns:
        The ``Path`` of the matching config file.

    Raises:
        FileNotFoundError: When nothing matches; the message lists every
            directory that was searched.
    """
    direct = Path(name).expanduser()
    if direct.is_file():
        return direct

    # Accept both "marin-dev" and "marin-dev.yaml" as the lookup key.
    as_given = Path(name)
    if as_given.suffix in _YAML_SUFFIXES:
        search_stem = as_given.stem
    else:
        search_stem = name

    found = find_configs(dirs, name=search_stem).get(search_stem)
    if found is not None:
        logger.debug("Resolved cluster config %r -> %s", name, found)
        return found

    searched_str = "\n ".join(str(d) for d in _resolve_dirs(dirs))
    raise FileNotFoundError(f"No config file found for cluster {name!r}.\nSearched directories:\n {searched_str}")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def list_cluster_configs(dirs: Sequence[Path | str]) -> dict[str, Path]:
    """Return every YAML cluster config found in ``dirs``, keyed by stem.

    Convenience wrapper around :func:`find_configs` with no name filter.
    """
    every_config = find_configs(dirs, name=None)
    return every_config
|
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
# Copyright The Marin Authors
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Generic distributed locking with lease-based semantics.
|
|
5
|
+
|
|
6
|
+
Provides lease-based distributed locks backed by a single lock file.
|
|
7
|
+
Four backend implementations are available:
|
|
8
|
+
|
|
9
|
+
- **GcsLease**: generation-based conditional writes for atomicity.
|
|
10
|
+
- **S3Lease**: conditional writes (``If-None-Match`` / ``If-Match``) for S3-compatible stores.
|
|
11
|
+
- **LocalFileLease**: ``fcntl`` file locking for mutual exclusion.
|
|
12
|
+
- **FsspecLease**: best-effort write-then-read-back (advisory only).
|
|
13
|
+
|
|
14
|
+
Use ``create_lock()`` to obtain the appropriate implementation for a given path.
|
|
15
|
+
|
|
16
|
+
The lock is lease-based: holders must periodically refresh the lease,
|
|
17
|
+
and stale leases (older than ``HEARTBEAT_TIMEOUT``) can be taken over
|
|
18
|
+
by other holders.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import abc
|
|
22
|
+
import functools
|
|
23
|
+
import json
|
|
24
|
+
import logging
|
|
25
|
+
import os
|
|
26
|
+
import threading
|
|
27
|
+
import time
|
|
28
|
+
from dataclasses import asdict, dataclass
|
|
29
|
+
|
|
30
|
+
import fsspec
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
# NOTE(review): not referenced by the code visible in this module; presumably
# read by callers that schedule periodic refresh() calls — confirm.
HEARTBEAT_INTERVAL = 30  # seconds between lease refreshes
|
|
35
|
+
# A lease whose timestamp is more than this many seconds in the past is
# reported as stale by Lease.is_stale().
HEARTBEAT_TIMEOUT = 90  # seconds before considering a lease stale
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LeaseLostError(Exception):
    """Raised when this worker no longer owns the lease.

    Signals that the lock is now held by someone else (or has vanished).
    Treat it as fatal: the step must terminate immediately.
    """
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class Lease:
    """Contents of the lock file: the current holder and its last heartbeat."""

    # Identifier of the worker that holds the lease.
    worker_id: str
    # ``time.time()`` value recorded when the lease was written.
    timestamp: float

    def is_stale(self) -> bool:
        """Return True once the lease is older than ``HEARTBEAT_TIMEOUT`` seconds."""
        age_seconds = time.time() - self.timestamp
        return age_seconds > HEARTBEAT_TIMEOUT
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def default_worker_id() -> str:
    """Return a holder ID for the current host, process and thread.

    The ID has the form ``<hostname>-<pid>-<thread ident>``. The process ID
    is included because a thread ident alone does not tell processes on the
    same host apart: a forked child keeps its parent thread's ident. Two such
    processes would otherwise each treat the other's lease as their own.
    """
    return f"{os.uname()[1]}-{os.getpid()}-{threading.get_ident()}"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _is_local_path(path: str) -> bool:
|
|
62
|
+
return not path.startswith("gs://") and "://" not in path
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _is_gcs_path(path: str) -> bool:
|
|
66
|
+
return path.startswith("gs://")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _is_s3_path(path: str) -> bool:
|
|
70
|
+
return path.startswith("s3://")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Abstract base
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class DistributedLease(abc.ABC):
    """Base class for lease-based distributed locks.

    Subclasses implement the storage operations (read/write/delete);
    the locking protocol (acquire, refresh, release) is defined here.

    Args:
        lock_path: Path to the lock file.
        worker_id: Unique identifier for this lock holder. When omitted (or
            empty), ``default_worker_id()`` is used.
    """

    def __init__(self, lock_path: str, worker_id: str | None = None):
        self.lock_path = lock_path
        self.worker_id = worker_id or default_worker_id()

    # -- abstract storage ops ------------------------------------------------

    @abc.abstractmethod
    def _read_with_generation(self) -> tuple[int, Lease | None]:
        """Read lock file. Returns ``(generation, lease)`` or ``(0, None)`` if absent."""
        ...

    @abc.abstractmethod
    def _write(self, lease: Lease, if_generation_match: int) -> None:
        """Write lock file with generation/concurrency precondition.

        A failed precondition is reported either as ``FileExistsError`` or as
        a backend exception whose class name contains ``PreconditionFailed``.
        """
        ...

    @abc.abstractmethod
    def _delete(self) -> None:
        """Delete lock file. Must not raise if already absent."""
        ...

    # -- public API ----------------------------------------------------------

    def try_acquire(self) -> bool:
        """Try to acquire the lock.

        Returns:
            ``True`` if this worker now holds the lock (or already held a
            fresh lease). ``False`` if another worker holds a fresh lease or
            won the race for the conditional write.

        Raises:
            Exception: Any storage error that is not a lost write race.
        """
        generation, lock_data = self._read_with_generation()

        if lock_data and not lock_data.is_stale():
            if lock_data.worker_id == self.worker_id:
                logger.debug("[%s] Already hold lock at %s", self.worker_id, self.lock_path)
                return True
            logger.debug("[%s] Lock %s held by %s (fresh)", self.worker_id, self.lock_path, lock_data.worker_id)
            return False

        if lock_data:
            logger.debug("[%s] Found stale lock at %s from %s", self.worker_id, self.lock_path, lock_data.worker_id)

        lease = Lease(worker_id=self.worker_id, timestamp=time.time())
        try:
            self._write(lease, if_generation_match=generation)
        except FileExistsError:
            logger.debug("[%s] Lost lock race for %s", self.worker_id, self.lock_path)
            return False
        except Exception as e:
            # Matched by class name so this module does not have to import
            # the backend's exception type.
            if "PreconditionFailed" in type(e).__name__:
                logger.debug("[%s] Lost lock race for %s (precondition)", self.worker_id, self.lock_path)
                return False
            raise

        return True

    def refresh(self) -> None:
        """Refresh a lease held by the current holder.

        Raises:
            LeaseLostError: If the lock is held by a different worker, if the
                lock file has disappeared, or if the conditional write fails
                because the lock changed between the read and the write. A
                missing lock file means another worker deleted it (e.g. took
                over a stale lease and released it). In every case the
                current holder has irrecoverably lost ownership.
        """
        generation, lock_data = self._read_with_generation()
        if lock_data is None:
            raise LeaseLostError(f"Lease lost: lock file {self.lock_path} disappeared — another worker likely took over")
        if lock_data.worker_id != self.worker_id:
            raise LeaseLostError(
                f"Lease lost: lock at {self.lock_path} held by {lock_data.worker_id}, expected {self.worker_id}"
            )

        try:
            self._write(Lease(self.worker_id, time.time()), generation)
        except Exception as e:
            # Same lost-race classification as try_acquire; other errors
            # propagate unchanged.
            if isinstance(e, FileExistsError) or "PreconditionFailed" in type(e).__name__:
                raise LeaseLostError(
                    f"Lease lost: conditional refresh of {self.lock_path} failed; the lock changed after it was read"
                ) from e
            raise

    def release(self) -> None:
        """Release the lock if held by this holder. Idempotent.

        Does nothing when the lock file is missing or belongs to another
        worker.
        """
        try:
            _, lock_data = self._read_with_generation()
            if lock_data and lock_data.worker_id == self.worker_id:
                self._delete()
                logger.info("Released lock path=%s worker=%s", self.lock_path, self.worker_id)
        except FileNotFoundError:
            pass

    def has_active_holder(self) -> bool:
        """Check if any holder has an active (non-stale) lock."""
        try:
            _, lock_data = self._read_with_generation()
        except FileNotFoundError:
            return False
        return lock_data is not None and not lock_data.is_stale()
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
# GCS backend
|
|
179
|
+
# ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class GcsLease(DistributedLease):
    """GCS-backed lease using generation-based conditional writes.

    A new ``google.cloud.storage.Client`` is created for every storage
    operation.
    """

    @staticmethod
    def _parse_gcs_path(path: str) -> tuple[str, str]:
        """Parse ``gs://bucket/path`` into ``(bucket, blob_path)``."""
        path = path[5:]  # Remove gs://
        bucket, _, blob_path = path.partition("/")
        return (bucket, blob_path)

    def _read_with_generation(self) -> tuple[int, Lease | None]:
        """Read the lock blob.

        Returns:
            ``(blob.generation, lease)`` when the blob exists, or
            ``(0, None)`` when it is absent or vanishes mid-read.
        """
        from google.api_core.exceptions import NotFound
        from google.cloud import storage

        client = storage.Client()
        bucket_name, blob_path = self._parse_gcs_path(self.lock_path)
        bucket = client.bucket(bucket_name)
        blob = bucket.get_blob(blob_path)
        if blob is None:
            return (0, None)
        try:
            # download_as_bytes replaces the deprecated download_as_string
            # alias; json.loads accepts bytes directly.
            data = json.loads(blob.download_as_bytes())
        except NotFound:
            # Blob was deleted between get_blob and the download
            logger.debug("[%s] Lock blob %s disappeared during read (race)", self.worker_id, self.lock_path)
            return (0, None)
        return (blob.generation, Lease(**data))

    def _write(self, lease: Lease, if_generation_match: int) -> None:
        """Upload ``lease`` as JSON, conditional on the blob's generation.

        ``if_generation_match`` is forwarded to ``upload_from_string``; the
        value ``0`` is what ``_read_with_generation`` reports for an absent
        blob.
        """
        from google.cloud import storage

        client = storage.Client()
        bucket_name, blob_path = self._parse_gcs_path(self.lock_path)
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(blob_path)
        blob.upload_from_string(json.dumps(asdict(lease)), if_generation_match=if_generation_match)

    def _delete(self) -> None:
        """Delete the lock blob; a blob that is already gone is not an error."""
        from google.api_core.exceptions import NotFound
        from google.cloud import storage

        client = storage.Client()
        bucket_name, blob_path = self._parse_gcs_path(self.lock_path)
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(blob_path)
        try:
            blob.delete()
        except NotFound:
            logger.debug("Lock blob %s already deleted", self.lock_path)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
# S3 backend
|
|
235
|
+
# ---------------------------------------------------------------------------
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class S3Lease(DistributedLease):
    """Lease stored in an S3 object, guarded by conditional PutObject.

    Creation uses ``If-None-Match: *`` and updates use ``If-Match: <etag>``,
    so any S3-compatible store that honours conditional PutObject
    (AWS S3, Cloudflare R2, MinIO, etc.) can back it. The headers are added
    through botocore's event system (botocore is available transitively via
    s3fs) because the high-level SDKs do not expose them.
    """

    def __init__(self, lock_path: str, worker_id: str | None = None):
        super().__init__(lock_path, worker_id)
        # ETag seen by the most recent read; None when the object was absent.
        self._last_etag: str | None = None

    @staticmethod
    def _parse_s3_path(path: str) -> tuple[str, str]:
        """Split ``s3://bucket/key`` into ``(bucket, key)``."""
        without_scheme = path[5:]  # Remove s3://
        bucket, _, key = without_scheme.partition("/")
        return (bucket, key)

    @staticmethod
    @functools.cache
    def _make_client(cache_key: str = ""):
        """Build a botocore S3 client, memoized per *cache_key*.

        ``_write`` temporarily registers an event hook on the client (added
        before the put, removed afterwards). Sharing one client between
        concurrent ``_write`` calls would interleave those hooks, corrupting
        headers and causing ``SignatureDoesNotMatch``. Callers pass the lock
        path as the key, so distinct lock paths get distinct clients.
        """
        import botocore.config
        import botocore.session

        session = botocore.session.get_session()
        endpoint_url = os.environ.get("AWS_ENDPOINT_URL_S3") or os.environ.get("AWS_ENDPOINT_URL")
        client_kwargs: dict = {}
        if endpoint_url:
            client_kwargs["endpoint_url"] = endpoint_url
            # Some S3-compatible endpoints (CoreWeave cwobject.com, cwlota.com)
            # reject path-style requests. Virtual-host style is the modern
            # default for AWS S3 anyway, so always prefer it when a custom
            # endpoint is in use.
            client_kwargs["config"] = botocore.config.Config(s3={"addressing_style": "virtual"})
        return session.create_client("s3", **client_kwargs)

    def _read_with_generation(self) -> tuple[int, Lease | None]:
        """Fetch the lock object and remember its ETag.

        Returns ``(1, lease)`` when the object exists and ``(0, None)`` when
        S3 reports ``NoSuchKey``; other client errors propagate.
        """
        from botocore.exceptions import ClientError

        s3 = self._make_client(self.lock_path)
        bucket, key = self._parse_s3_path(self.lock_path)
        try:
            response = s3.get_object(Bucket=bucket, Key=key)
            payload = json.loads(response["Body"].read())
            self._last_etag = response["ETag"]
            return (1, Lease(**payload))
        except ClientError as e:
            if e.response["Error"]["Code"] != "NoSuchKey":
                raise
            self._last_etag = None
            return (0, None)

    def _write(self, lease: Lease, if_generation_match: int) -> None:
        """Put the lease with a conditional header.

        ``if_generation_match == 0`` means "create only if absent"; any other
        value means "replace only if the ETag from the last read still
        matches". A failed precondition is raised as ``FileExistsError``.
        """
        from botocore.exceptions import ClientError

        s3 = self._make_client(self.lock_path)
        bucket, key = self._parse_s3_path(self.lock_path)
        body = json.dumps(asdict(lease)).encode()

        if if_generation_match != 0:
            assert self._last_etag is not None, "Cannot conditionally update without a prior read"
            condition_header = {"If-Match": self._last_etag}
        else:
            condition_header = {"If-None-Match": "*"}

        def inject_condition(request, **kwargs):
            for header_name, header_value in condition_header.items():
                request.headers[header_name] = header_value

        s3.meta.events.register("before-sign.s3.PutObject", inject_condition)
        try:
            s3.put_object(Bucket=bucket, Key=key, Body=body)
        except ClientError as e:
            if e.response["Error"]["Code"] not in ("PreconditionFailed", "412"):
                raise
            raise FileExistsError(f"Conditional write failed for {self.lock_path}") from e
        finally:
            # Always remove the hook so it cannot leak into later requests.
            s3.meta.events.unregister("before-sign.s3.PutObject", inject_condition)

    def _delete(self) -> None:
        """Delete the lock object."""
        s3 = self._make_client(self.lock_path)
        bucket, key = self._parse_s3_path(self.lock_path)
        s3.delete_object(Bucket=bucket, Key=key)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# ---------------------------------------------------------------------------
|
|
333
|
+
# Local filesystem backend
|
|
334
|
+
# ---------------------------------------------------------------------------
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
class LocalFileLease(DistributedLease):
    """Local-filesystem lease using ``fcntl`` file locking."""

    def _read_with_generation(self) -> tuple[int, Lease | None]:
        """Read the lock file under a shared ``flock``.

        Returns ``(1, lease)`` when the file has content, and ``(0, None)``
        when it is missing or empty.
        """
        import fcntl

        try:
            with open(self.lock_path, "r") as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_SH)
                content = f.read()
                if not content:
                    return (0, None)
                data = json.loads(content)
                return (1, Lease(**data))
        except FileNotFoundError:
            return (0, None)

    def _write(self, lease: Lease, if_generation_match: int) -> None:
        """Write ``lease`` under an exclusive ``flock``.

        ``if_generation_match`` is not used by this backend. Instead the
        current content is re-read while holding the exclusive lock, and the
        write is refused if another worker holds a non-stale lease.

        Raises:
            FileExistsError: If a different worker holds a non-stale lease.
        """
        import fcntl

        parent = os.path.dirname(self.lock_path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError; the lock then lives in the working directory.
        if parent:
            os.makedirs(parent, exist_ok=True)

        # "a+" creates the file if needed without truncating it before the
        # flock is held.
        with open(self.lock_path, "a+") as f:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX)
            f.seek(0)
            content = f.read()
            if content:
                current = Lease(**json.loads(content))
                if not current.is_stale() and current.worker_id != lease.worker_id:
                    raise FileExistsError(f"Lock held by {current.worker_id}")
            f.seek(0)
            f.truncate()
            f.write(json.dumps(asdict(lease)))

    def _delete(self) -> None:
        """Remove the lock file; a missing file is not an error."""
        try:
            os.remove(self.lock_path)
        except FileNotFoundError:
            pass
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
# ---------------------------------------------------------------------------
|
|
380
|
+
# fsspec best-effort backend
|
|
381
|
+
# ---------------------------------------------------------------------------
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
class FsspecLease(DistributedLease):
    """Advisory, best-effort lease on any filesystem fsspec can open."""

    def _get_fs(self) -> tuple[fsspec.AbstractFileSystem, str]:
        """Resolve the lock path to an fsspec ``(filesystem, path)`` pair."""
        return fsspec.core.url_to_fs(self.lock_path)

    def _read_with_generation(self) -> tuple[int, Lease | None]:
        """Read the lock file.

        Returns ``(1, lease)`` when it has content, and ``(0, None)`` when it
        is missing or empty.
        """
        fs, path = self._get_fs()
        try:
            with fs.open(path, "r") as handle:
                raw = handle.read()
        except FileNotFoundError:
            return (0, None)
        if not raw:
            return (0, None)
        return (1, Lease(**json.loads(raw)))

    def _write(self, lease: Lease, if_generation_match: int) -> None:
        """Write the lease, wait briefly, then read it back to see who won.

        ``if_generation_match`` is not used by this backend. Raises
        ``FileExistsError`` when the read-back shows another worker's id, or
        when the file is gone by the time it is read back.
        """
        fs, path = self._get_fs()
        serialized = json.dumps(asdict(lease))

        # Everything before the last "/" is the parent; empty when there is none.
        parent, _, _ = path.rpartition("/")
        if parent:
            fs.makedirs(parent, exist_ok=True)

        with fs.open(path, "w") as handle:
            handle.write(serialized)

        # Read back and check if our write stuck (best-effort race detection)
        time.sleep(0.1)
        try:
            with fs.open(path, "r") as handle:
                readback = json.loads(handle.read())
        except FileNotFoundError as err:
            raise FileExistsError("Lock file disappeared after write") from err
        if readback.get("worker_id") != lease.worker_id:
            raise FileExistsError(f"Lock race lost to {readback.get('worker_id')}")

    def _delete(self) -> None:
        """Remove the lock file; a missing file is not an error."""
        fs, path = self._get_fs()
        try:
            fs.rm(path)
        except FileNotFoundError:
            pass
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
# ---------------------------------------------------------------------------
|
|
431
|
+
# Factory
|
|
432
|
+
# ---------------------------------------------------------------------------
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def create_lock(lock_path: str, worker_id: str | None = None) -> DistributedLease:
    """Pick and construct the lease backend that matches *lock_path*.

    ``gs://`` paths get a ``GcsLease``, ``s3://`` paths an ``S3Lease``, paths
    without any ``://`` a ``LocalFileLease``, and every other URL scheme the
    best-effort ``FsspecLease``.
    """
    if _is_gcs_path(lock_path):
        return GcsLease(lock_path, worker_id)
    if _is_s3_path(lock_path):
        return S3Lease(lock_path, worker_id)
    if _is_local_path(lock_path):
        return LocalFileLease(lock_path, worker_id)
    return FsspecLease(lock_path, worker_id)
|