reslock 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ .eggs/
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ ENV/
16
+
17
+ # IDE
18
+ .idea/
19
+ .vscode/
20
+ *.swp
21
+
22
+ # Testing
23
+ .pytest_cache/
24
+ .coverage
25
+ htmlcov/
26
+ .pyright/
27
+ .ruff_cache/
28
+
29
+ # OS
30
+ .DS_Store
31
+ Thumbs.db
reslock-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.4
2
+ Name: reslock
3
+ Version: 0.1.0
4
+ Summary: Resource lock manager for coordinating shared system resources (GPU VRAM, RAM, CPU) across processes
5
+ Project-URL: Homepage, https://github.com/mo22/reslock
6
+ Project-URL: Repository, https://github.com/mo22/reslock
7
+ Author-email: Moritz Möller <mm@mxs.de>
8
+ License-Expression: MIT
9
+ Keywords: gpu,lock,resource,scheduling,vram
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: click>=8.0.0
19
+ Requires-Dist: portalocker>=2.0.0
20
+ Requires-Dist: pydantic>=2.0.0
21
+ Requires-Dist: rich>=13.0.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: pyright>=1.1.0; extra == 'dev'
24
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
25
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
26
+ Requires-Dist: ruff>=0.4.0; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # reslock
30
+
31
+ Resource lock manager for coordinating shared system resources (GPU VRAM, RAM, CPU cores) across multiple processes on a single machine.
32
+
33
+ ## Problem
34
+
35
+ Multiple GPU-consuming processes (llama.cpp, whisper, vLLM, training jobs) compete for limited resources — especially VRAM. Without coordination, they OOM or degrade each other.
36
+
37
+ ## How it works
38
+
39
+ - All coordination happens through a single JSON state file — no daemon required
40
+ - Processes coordinate via file locking (held only during reads/writes, not for lease duration)
41
+ - Dead processes are automatically cleaned up via PID checking
42
+ - Priority queue determines which waiter gets resources next
43
+ - Reclaimable leases allow loaded models to be preempted by higher-priority work
44
+
45
+ ## Install
46
+
47
+ ```bash
48
+ pip install reslock
49
+ ```
50
+
51
+ ## Python API
52
+
53
+ ```python
54
+ from reslock import ResourcePool
55
+
56
+ pool = ResourcePool() # uses ~/.reslock/state.json
57
+
58
+ # Context manager — blocks until resources are available
59
+ with pool.acquire(vram_mb=4000, priority=5, label="whisper") as lease:
60
+ run_whisper(audio_file)
61
+
62
+ # Non-blocking
63
+ lease = pool.try_acquire(vram_mb=4000)
64
+ if lease:
65
+ try:
66
+ do_work()
67
+ finally:
68
+ lease.release()
69
+
70
+ # Async
71
+ lease = await pool.acquire_async(vram_mb=4000)
+ try:
+     await run_inference()
+ finally:
+     lease.release()
73
+
74
+ # Reclaimable lease — can be preempted
75
+ lease = pool.acquire(vram_mb=4000, reclaimable=True)
76
+ load_model()
77
+ # ... later:
78
+ if lease.reclaim_requested:
79
+ unload_model()
80
+ lease.release()
81
+
82
+ # Check status
83
+ status = pool.status()
84
+ print(status.available) # free resources
85
+ ```
86
+
87
+ ## CLI
88
+
89
+ ```bash
90
+ # Initialize (auto-detects GPU)
91
+ reslock init
92
+
93
+ # Set resources manually
94
+ reslock set vram_mb 24000
95
+ reslock set gpu_slots 2
96
+
97
+ # Show status
98
+ reslock status
99
+
100
+ # Run a command with reserved resources
101
+ reslock run --vram 4G llama-cli --model model.gguf
102
+ reslock run --vram 8G --priority 10 --label "llama-70b" llama-cli ...
103
+ reslock run --vram 4G --ram 16G --cpu 4 python train.py
104
+
105
+ # Manage leases
106
+ reslock list
107
+ reslock release abc-123
108
+ reslock release --label whisper
109
+ reslock reset
110
+ ```
111
+
112
+ ## How resources work
113
+
114
+ Resources are named quantities with a total capacity. Resource names are arbitrary strings — define whatever you need:
115
+
116
+ ```bash
117
+ reslock set vram_mb 24000
118
+ reslock set ram_mb 65536
119
+ reslock set gpu_slots 2
120
+ ```
121
+
122
+ Leases reserve amounts from these pools. When a lease is released (or its process dies), the resources become available again.
123
+
124
+ ## Priority queue
125
+
126
+ When resources aren't immediately available, requests enter a priority queue. Higher priority number = more urgent. Ties are broken by arrival time (FIFO).
127
+
128
+ ## Reclaimable leases
129
+
130
+ A process can mark its lease as **reclaimable** — "I'm using this, but can give it up if needed." When a higher-priority request needs those resources, `reclaim_requested` is set to `True`. The lease holder cooperates by releasing.
131
+
132
+ ## Development
133
+
134
+ ```bash
135
+ uv venv && uv pip install -e ".[dev]"
136
+ pytest
137
+ ruff check src/ tests/
138
+ ```
@@ -0,0 +1,110 @@
1
+ # reslock
2
+
3
+ Resource lock manager for coordinating shared system resources (GPU VRAM, RAM, CPU cores) across multiple processes on a single machine.
4
+
5
+ ## Problem
6
+
7
+ Multiple GPU-consuming processes (llama.cpp, whisper, vLLM, training jobs) compete for limited resources — especially VRAM. Without coordination, they OOM or degrade each other.
8
+
9
+ ## How it works
10
+
11
+ - All coordination happens through a single JSON state file — no daemon required
12
+ - Processes coordinate via file locking (held only during reads/writes, not for lease duration)
13
+ - Dead processes are automatically cleaned up via PID checking
14
+ - Priority queue determines which waiter gets resources next
15
+ - Reclaimable leases allow loaded models to be preempted by higher-priority work
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install reslock
21
+ ```
22
+
23
+ ## Python API
24
+
25
+ ```python
26
+ from reslock import ResourcePool
27
+
28
+ pool = ResourcePool() # uses ~/.reslock/state.json
29
+
30
+ # Context manager — blocks until resources are available
31
+ with pool.acquire(vram_mb=4000, priority=5, label="whisper") as lease:
32
+ run_whisper(audio_file)
33
+
34
+ # Non-blocking
35
+ lease = pool.try_acquire(vram_mb=4000)
36
+ if lease:
37
+ try:
38
+ do_work()
39
+ finally:
40
+ lease.release()
41
+
42
+ # Async
43
+ lease = await pool.acquire_async(vram_mb=4000)
+ try:
+     await run_inference()
+ finally:
+     lease.release()
45
+
46
+ # Reclaimable lease — can be preempted
47
+ lease = pool.acquire(vram_mb=4000, reclaimable=True)
48
+ load_model()
49
+ # ... later:
50
+ if lease.reclaim_requested:
51
+ unload_model()
52
+ lease.release()
53
+
54
+ # Check status
55
+ status = pool.status()
56
+ print(status.available) # free resources
57
+ ```
58
+
59
+ ## CLI
60
+
61
+ ```bash
62
+ # Initialize (auto-detects GPU)
63
+ reslock init
64
+
65
+ # Set resources manually
66
+ reslock set vram_mb 24000
67
+ reslock set gpu_slots 2
68
+
69
+ # Show status
70
+ reslock status
71
+
72
+ # Run a command with reserved resources
73
+ reslock run --vram 4G llama-cli --model model.gguf
74
+ reslock run --vram 8G --priority 10 --label "llama-70b" llama-cli ...
75
+ reslock run --vram 4G --ram 16G --cpu 4 python train.py
76
+
77
+ # Manage leases
78
+ reslock list
79
+ reslock release abc-123
80
+ reslock release --label whisper
81
+ reslock reset
82
+ ```
83
+
84
+ ## How resources work
85
+
86
+ Resources are named quantities with a total capacity. Resource names are arbitrary strings — define whatever you need:
87
+
88
+ ```bash
89
+ reslock set vram_mb 24000
90
+ reslock set ram_mb 65536
91
+ reslock set gpu_slots 2
92
+ ```
93
+
94
+ Leases reserve amounts from these pools. When a lease is released (or its process dies), the resources become available again.
95
+
96
+ ## Priority queue
97
+
98
+ When resources aren't immediately available, requests enter a priority queue. Higher priority number = more urgent. Ties are broken by arrival time (FIFO).
99
+
100
+ ## Reclaimable leases
101
+
102
+ A process can mark its lease as **reclaimable** — "I'm using this, but can give it up if needed." When a higher-priority request needs those resources, `reclaim_requested` is set to `True`. The lease holder cooperates by releasing.
103
+
104
+ ## Development
105
+
106
+ ```bash
107
+ uv venv && uv pip install -e ".[dev]"
108
+ pytest
109
+ ruff check src/ tests/
110
+ ```
@@ -0,0 +1,83 @@
1
+ [project]
2
+ name = "reslock"
3
+ version = "0.1.0"
4
+ description = "Resource lock manager for coordinating shared system resources (GPU VRAM, RAM, CPU) across processes"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = "MIT"
8
+ authors = [{ name = "Moritz Möller", email = "mm@mxs.de" }]
9
+ keywords = ["gpu", "vram", "resource", "lock", "scheduling"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.10",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ ]
19
+ dependencies = [
20
+ "portalocker>=2.0.0",
21
+ "click>=8.0.0",
22
+ "rich>=13.0.0",
23
+ "pydantic>=2.0.0",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ dev = [
28
+ "pytest>=8.0.0",
29
+ "pytest-asyncio>=0.23.0",
30
+ "ruff>=0.4.0",
31
+ "pyright>=1.1.0",
32
+ ]
33
+
34
+ [project.scripts]
35
+ reslock = "reslock.cli:main"
36
+
37
+ [project.urls]
38
+ Homepage = "https://github.com/mo22/reslock"
39
+ Repository = "https://github.com/mo22/reslock"
40
+
41
+ [build-system]
42
+ requires = ["hatchling"]
43
+ build-backend = "hatchling.build"
44
+
45
+ [tool.hatch.build.targets.wheel]
46
+ packages = ["src/reslock"]
47
+
48
+ [tool.hatch.build.targets.sdist]
49
+ include = ["src/reslock"]
50
+
51
+ [tool.ruff]
52
+ target-version = "py310"
53
+ line-length = 100
54
+ src = ["src"]
55
+
56
+ [tool.ruff.lint]
57
+ select = [
58
+ "E", # pycodestyle errors
59
+ "W", # pycodestyle warnings
60
+ "F", # pyflakes
61
+ "I", # isort
62
+ "B", # flake8-bugbear
63
+ "C4", # flake8-comprehensions
64
+ "UP", # pyupgrade
65
+ "ARG", # flake8-unused-arguments
66
+ "SIM", # flake8-simplify
67
+ ]
68
+ ignore = [
69
+ "E501", # line too long (handled by formatter)
70
+ "B008", # function calls in argument defaults (needed for Click)
71
+ ]
72
+
73
+ [tool.ruff.lint.isort]
74
+ known-first-party = ["reslock"]
75
+
76
+ [tool.pyright]
77
+ pythonVersion = "3.10"
78
+ typeCheckingMode = "strict"
79
+ ignore = ["**/node_modules", "**/__pycache__"]
80
+
81
+ [tool.pytest.ini_options]
82
+ asyncio_mode = "auto"
83
+ testpaths = ["tests"]
@@ -0,0 +1,8 @@
1
+ """reslock — Resource lock manager for coordinating shared system resources."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from reslock.models import Lease, PoolStatus, QueueEntry, State
6
+ from reslock.pool import LeaseHandle, ResourcePool
7
+
8
+ __all__ = ["Lease", "LeaseHandle", "PoolStatus", "QueueEntry", "ResourcePool", "State"]
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from reslock.models import State
6
+
7
+
8
def is_pid_alive(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    Sends signal 0, which performs existence/permission checks without
    delivering anything. A PermissionError means the process exists but
    belongs to another user, so it still counts as alive.
    """
    try:
        os.kill(pid, 0)
        return True
    except PermissionError:
        return True
    except ProcessLookupError:
        return False
16
+
17
+
18
def remove_dead_processes(state: State) -> None:
    """Drop leases and queue entries whose owning process has exited."""
    alive = is_pid_alive
    state.leases = [ls for ls in state.leases if alive(ls.pid)]
    state.queue = [entry for entry in state.queue if alive(entry.pid)]
@@ -0,0 +1,313 @@
1
+ """CLI for reslock."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import signal
7
+ import subprocess
8
+ import sys
9
+ import threading
10
+ import time
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+
14
+ import click
15
+ from rich.console import Console
16
+ from rich.table import Table
17
+
18
+ from reslock.detect import detect_gpu_vram_mb
19
+ from reslock.models import State
20
+ from reslock.pool import ResourcePool
21
+ from reslock.state import DEFAULT_STATE_PATH, ensure_state_file, transact
22
+
23
+ console = Console()
24
+
25
+
26
+ def _parse_size(value: str) -> int:
27
+ """Parse a size string like '4G', '500M', or plain number (MB)."""
28
+ m = re.match(r"^(\d+(?:\.\d+)?)\s*([gGmM]?)$", value)
29
+ if not m:
30
+ raise click.BadParameter(f"Invalid size: {value}")
31
+ num = float(m.group(1))
32
+ unit = m.group(2).upper()
33
+ if unit == "G":
34
+ return int(num * 1024)
35
+ return int(num)
36
+
37
+
38
# Top-level Click command group; subcommands attach via @main.command().
# The docstring below is the CLI help text shown by `reslock --help`.
@click.group()
@click.version_option(package_name="reslock")
def main() -> None:
    """Resource lock manager for coordinating shared system resources."""
42
+
43
+
44
@main.command()
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def init(state: Path | None) -> None:
    """Initialize reslock state file, auto-detecting GPU if available."""
    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)

    # Best-effort GPU probe; returns {} when nvidia-smi is unavailable.
    gpu = detect_gpu_vram_mb()

    def _init(st: State) -> None:
        # Only fill in detected values that are not already configured,
        # so re-running `reslock init` never overwrites manual settings.
        for key, val in gpu.items():
            if key not in st.resources:
                st.resources[key] = val

    transact(path, _init)

    if gpu:
        console.print(f"[green]Detected:[/green] vram_mb={gpu.get('vram_mb', 0)}")
    console.print(f"[green]State file:[/green] {path}")
63
+
64
+
65
@main.command(name="set")
@click.argument("resource")
@click.argument("value", type=int)
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def set_resource(resource: str, value: int, state: Path | None) -> None:
    """Set a resource capacity (e.g., reslock set vram_mb 24000)."""
    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)

    def _set(st: State) -> None:
        # Resource names are arbitrary strings; setting one overwrites any
        # previously configured capacity under that name.
        st.resources[resource] = value

    transact(path, _set)
    console.print(f"[green]Set[/green] {resource} = {value}")
79
+
80
+
81
@main.command()
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def status(state: Path | None) -> None:
    """Show current resource status, leases, and queue."""
    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)
    pool = ResourcePool(path)
    st = pool.status()

    # Capacity table: one row per configured resource name.
    if st.resources:
        table = Table(title="Resources")
        table.add_column("Resource", style="cyan")
        table.add_column("Total", justify="right")
        table.add_column("Used", justify="right")
        table.add_column("Free", justify="right")
        for key, total in st.resources.items():
            # st.available already subtracts every active lease from total.
            free = st.available.get(key, 0)
            used = total - free
            table.add_row(key, str(total), str(used), str(free))
        console.print(table)
    else:
        console.print("[dim]No resources configured. Run 'reslock init' or 'reslock set'.[/dim]")

    # Active leases, with age rendered as minutes when >= 1m, else seconds.
    if st.leases:
        table = Table(title=f"Leases ({len(st.leases)} active)")
        table.add_column("ID", style="cyan")
        table.add_column("PID")
        table.add_column("Resources")
        table.add_column("Priority", justify="right")
        table.add_column("Label")
        table.add_column("Flags")
        table.add_column("Age")
        now = datetime.now(timezone.utc)
        for lease in st.leases:
            age = now - lease.acquired_at
            mins = int(age.total_seconds()) // 60
            age_str = f"{mins}m ago" if mins > 0 else f"{int(age.total_seconds())}s ago"
            res_str = ", ".join(f"{k}={v}" for k, v in lease.resources.items())
            flags: list[str] = []
            if lease.reclaimable:
                flags.append("reclaimable")
            if lease.reclaim_requested:
                flags.append("reclaim_requested")
            table.add_row(
                lease.id,
                str(lease.pid),
                res_str,
                str(lease.priority),
                lease.label or "",
                " ".join(flags),
                age_str,
            )
        console.print(table)

    # Waiting requests, in arrival order.
    if st.queue:
        table = Table(title=f"Queue ({len(st.queue)} waiting)")
        table.add_column("ID", style="cyan")
        table.add_column("PID")
        table.add_column("Resources")
        table.add_column("Priority", justify="right")
        table.add_column("Label")
        table.add_column("Queued")
        now = datetime.now(timezone.utc)
        for e in st.queue:
            age = now - e.queued_at
            secs = int(age.total_seconds())
            queued_str = f"{secs // 60}m ago" if secs >= 60 else f"{secs}s ago"
            res_str = ", ".join(f"{k}={v}" for k, v in e.resources.items())
            table.add_row(e.id, str(e.pid), res_str, str(e.priority), e.label or "", queued_str)
        console.print(table)

    if not st.leases and not st.queue:
        console.print("[dim]No active leases or queued requests.[/dim]")
154
+
155
+
156
@main.command()
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def list_(state: Path | None) -> None:
    """List active leases."""
    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)
    pool = ResourcePool(path)
    st = pool.status()
    if not st.leases:
        console.print("[dim]No active leases.[/dim]")
        return
    # One compact line per lease (plain output, unlike `status`'s tables).
    for lease in st.leases:
        res_str = ", ".join(f"{k}={v}" for k, v in lease.resources.items())
        lbl = f" label={lease.label}" if lease.label else ""
        console.print(f" {lease.id} pid={lease.pid} {res_str} prio={lease.priority}{lbl}")


# Register "list" as the CLI command name
# NOTE(review): @main.command() above has already registered this command under
# a name click derives from `list_`; add_command registers it a second time as
# "list". Consider @main.command(name="list") instead — confirm no duplicate
# entry appears in `reslock --help`.
list_.__name__ = "list"
main.add_command(list_, "list")
176
+
177
+
178
@main.command()
@click.argument("lease_id", required=False)
@click.option("--label", "-l", default=None, help="Release by label")
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def release(lease_id: str | None, label: str | None, state: Path | None) -> None:
    """Release a lease by ID or label.

    Requires a positional LEASE_ID or --label; every lease matching the
    selector is removed from the state file.
    """
    if not lease_id and not label:
        raise click.UsageError("Provide a lease ID or --label")
    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)

    def _release(st: State) -> int:
        # Return the number of removed leases; transact() forwards the
        # callback's return value, so no out-of-band accumulator is needed.
        before = len(st.leases)
        if lease_id:
            st.leases = [ls for ls in st.leases if ls.id != lease_id]
        elif label:
            st.leases = [ls for ls in st.leases if ls.label != label]
        return before - len(st.leases)

    removed = transact(path, _release)
    if removed:
        console.print(f"[green]Released {removed} lease(s).[/green]")
    else:
        console.print("[yellow]No matching lease found.[/yellow]")
203
+
204
+
205
@main.command()
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def reset(state: Path | None) -> None:
    """Clear all state (leases, queue)."""
    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)

    def _wipe(st: State) -> None:
        # Drop every lease and queued request; configured capacities remain.
        st.leases.clear()
        st.queue.clear()

    transact(path, _wipe)
    console.print("[green]State cleared.[/green]")
218
+
219
+
220
@main.command()
def schema() -> None:
    """Print the JSON schema for the state file."""
    # Imported lazily: only this command needs json.
    import json

    payload = json.dumps(State.model_json_schema(), indent=2)
    click.echo(payload)
226
+
227
+
228
@main.command()
@click.option("--vram", default=None, help="VRAM to reserve (e.g., 4G, 500M)")
@click.option("--ram", default=None, help="RAM to reserve (e.g., 16G)")
@click.option("--cpu", type=int, default=None, help="CPU cores to reserve")
@click.option("--priority", "-p", type=int, default=0, help="Priority (higher = more urgent)")
@click.option("--label", "-l", default=None, help="Label for this lease")
@click.option(
    "--reclaimable", is_flag=True, help="Allow lease to be reclaimed by higher-priority requests"
)
@click.option(
    "--reclaim-signal",
    default="SIGTERM",
    help="Signal to send to the child process when reclaim is requested (default: SIGTERM)",
)
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
@click.argument("command", nargs=-1, required=True)
def run(
    vram: str | None,
    ram: str | None,
    cpu: int | None,
    priority: int,
    label: str | None,
    reclaimable: bool,
    reclaim_signal: str,
    state: Path | None,
    command: tuple[str, ...],
) -> None:
    """Reserve resources and run a command.

    With --reclaimable, the lease can be reclaimed by higher-priority requests.
    When reclaim is requested, the specified signal is sent to the child process.
    """
    # Map CLI flags onto the well-known resource names used by `reslock init`.
    resources: dict[str, int] = {}
    if vram:
        resources["vram_mb"] = _parse_size(vram)
    if ram:
        resources["ram_mb"] = _parse_size(ram)
    if cpu:
        resources["cpu_cores"] = cpu

    if not resources:
        raise click.UsageError("Specify at least one resource (--vram, --ram, --cpu)")

    # Validate the signal name up front, before blocking on resources.
    sig = getattr(signal, reclaim_signal, None)
    if sig is None:
        raise click.BadParameter(f"Unknown signal: {reclaim_signal}")

    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)
    pool = ResourcePool(path)

    console.print(f"[dim]Waiting for resources: {resources}...[/dim]")
    with pool.acquire(
        priority=priority, label=label, reclaimable=reclaimable, **resources
    ) as lease:
        console.print(f"[green]Acquired lease {lease.id}[/green]")
        proc = subprocess.Popen(list(command))

        # Forward SIGTERM/SIGINT to the child so it can shut down cleanly.
        # NOTE(review): handlers are installed after Popen, so a signal that
        # arrives in that window is not forwarded — confirm acceptable.
        def _sighandler(signum: int, _frame: object) -> None:
            proc.send_signal(signum)

        signal.signal(signal.SIGTERM, _sighandler)
        signal.signal(signal.SIGINT, _sighandler)

        if reclaimable:

            # Background poller: when another request flags this lease for
            # reclaim, relay the configured signal to the child once and stop.
            def _watch_reclaim() -> None:
                while proc.poll() is None:
                    if lease.reclaim_requested:
                        console.print(
                            f"[yellow]Reclaim requested, sending {reclaim_signal} to child[/yellow]"
                        )
                        proc.send_signal(sig)
                        return
                    time.sleep(0.5)

            threading.Thread(target=_watch_reclaim, daemon=True).start()

        returncode = proc.wait()

    # Leaving the `with` block released the lease; propagate the child's
    # exit code as our own.
    console.print(f"[dim]Lease released. Process exited with code {returncode}.[/dim]")
    sys.exit(returncode)
310
+
311
+
312
# Allow `python -m reslock.cli` invocation in addition to the console script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ import shutil
4
+ import subprocess
5
+
6
+
7
def detect_gpu_vram_mb() -> dict[str, int]:
    """Detect total GPU VRAM via nvidia-smi. Returns empty dict if not available.

    Sums ``memory.total`` (reported in MB) over all GPUs. Lines that cannot
    be parsed as an integer (e.g. "[N/A]" for some devices) are skipped, so
    one odd device no longer aborts detection for the whole machine.
    """
    if not shutil.which("nvidia-smi"):
        return {}
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode != 0:
            return {}
        total = 0
        for line in result.stdout.strip().splitlines():
            stripped = line.strip()
            # Skip per-GPU values nvidia-smi could not report.
            if stripped.isdigit():
                total += int(stripped)
        if total > 0:
            return {"vram_mb": total}
    except (subprocess.TimeoutExpired, OSError):
        # Treat any probe failure as "no GPU detected".
        pass
    return {}
@@ -0,0 +1,97 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timezone
4
+ from uuid import uuid4
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
+ def _utcnow() -> datetime:
10
+ return datetime.now(timezone.utc)
11
+
12
+
13
+ def _new_id() -> str:
14
+ return uuid4().hex[:12]
15
+
16
+
17
+ class Lease(BaseModel):
18
+ id: str = Field(default_factory=_new_id)
19
+ pid: int
20
+ resources: dict[str, int]
21
+ priority: int = 0
22
+ acquired_at: datetime = Field(default_factory=_utcnow)
23
+ estimated_seconds: int | None = None
24
+ reclaimable: bool = False
25
+ reclaim_requested: bool = False
26
+ label: str | None = None
27
+
28
+ model_config = {"extra": "forbid"}
29
+
30
+
31
+ class QueueEntry(BaseModel):
32
+ id: str = Field(default_factory=_new_id)
33
+ pid: int
34
+ resources: dict[str, int]
35
+ priority: int = 0
36
+ queued_at: datetime = Field(default_factory=_utcnow)
37
+ label: str | None = None
38
+
39
+ model_config = {"extra": "forbid"}
40
+
41
+
42
+ class State(BaseModel):
43
+ version: int = 1
44
+ resources: dict[str, int] = Field(default_factory=dict)
45
+ leases: list[Lease] = Field(default_factory=list)
46
+ queue: list[QueueEntry] = Field(default_factory=list)
47
+
48
+ model_config = {"extra": "forbid"}
49
+
50
+ def available(self) -> dict[str, int]:
51
+ used: dict[str, int] = {}
52
+ for lease in self.leases:
53
+ for key, val in lease.resources.items():
54
+ used[key] = used.get(key, 0) + val
55
+ return {key: total - used.get(key, 0) for key, total in self.resources.items()}
56
+
57
+ def can_fit(self, resources: dict[str, int]) -> bool:
58
+ avail = self.available()
59
+ return all(avail.get(key, 0) >= val for key, val in resources.items())
60
+
61
+ def reclaimable_for(self, resources: dict[str, int]) -> list[Lease]:
62
+ """Return reclaimable leases that would need to be reclaimed to fit the request."""
63
+ avail = self.available()
64
+ shortfall: dict[str, int] = {}
65
+ for key, val in resources.items():
66
+ deficit = val - avail.get(key, 0)
67
+ if deficit > 0:
68
+ shortfall[key] = deficit
69
+ if not shortfall:
70
+ return []
71
+ candidates = [ls for ls in self.leases if ls.reclaimable and not ls.reclaim_requested]
72
+ candidates.sort(key=lambda ls: ls.priority)
73
+ result: list[Lease] = []
74
+ remaining = dict(shortfall)
75
+ for lease in candidates:
76
+ if not remaining:
77
+ break
78
+ helps = False
79
+ for key in list(remaining):
80
+ contrib = lease.resources.get(key, 0)
81
+ if contrib > 0:
82
+ helps = True
83
+ remaining[key] -= contrib
84
+ if remaining[key] <= 0:
85
+ del remaining[key]
86
+ if helps:
87
+ result.append(lease)
88
+ if remaining:
89
+ return []
90
+ return result
91
+
92
+
93
+ class PoolStatus(BaseModel):
94
+ resources: dict[str, int]
95
+ available: dict[str, int]
96
+ leases: list[Lease]
97
+ queue: list[QueueEntry]
@@ -0,0 +1,247 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ import time
6
+ from collections.abc import Generator
7
+ from contextlib import contextmanager
8
+ from pathlib import Path
9
+
10
+ from reslock.models import Lease, PoolStatus, QueueEntry, State
11
+ from reslock.state import DEFAULT_STATE_PATH, ensure_state_file, read_state, transact
12
+
13
+
14
class LeaseHandle:
    """Handle for an acquired lease, used to update or release it."""

    def __init__(self, lease: Lease, pool: ResourcePool) -> None:
        self._lease = lease  # last-seen snapshot of the lease record
        self._pool = pool
        self._released = False  # makes release() idempotent

    @property
    def id(self) -> str:
        return self._lease.id

    @property
    def reclaim_requested(self) -> bool:
        """Check the state file for whether reclaim has been requested."""
        # Re-read on every access: the flag is set by *other* processes.
        state = read_state(self._pool._path)
        for lease in state.leases:
            if lease.id == self._lease.id:
                self._lease = lease
                return lease.reclaim_requested
        return True  # lease gone = treat as reclaimed

    def wait_for_reclaim(self, poll_interval: float = 0.5) -> None:
        """Block until reclaim is requested for this lease."""
        while not self.reclaim_requested:
            time.sleep(poll_interval)

    async def wait_for_reclaim_async(self, poll_interval: float = 0.5) -> None:
        """Async wait until reclaim is requested for this lease."""
        while not self.reclaim_requested:
            await asyncio.sleep(poll_interval)

    def update(self, estimated_seconds: int | None = None) -> None:
        """Update mutable lease metadata (currently only the ETA hint)."""
        def _update(state: State) -> None:
            for lease in state.leases:
                if lease.id == self._lease.id:
                    if estimated_seconds is not None:
                        lease.estimated_seconds = estimated_seconds
                    break

        transact(self._pool._path, _update)

    def release(self) -> None:
        """Remove the lease from the state file; safe to call more than once."""
        if self._released:
            return
        self._released = True

        def _release(state: State) -> None:
            state.leases = [ls for ls in state.leases if ls.id != self._lease.id]

        transact(self._pool._path, _release)
65
+
66
+
67
class ResourcePool:
    """Coordinates leases on shared resources via a lock-protected state file.

    All reads and writes go through ``transact``/``read_state``, which hold a
    file lock only for the duration of each access — never for the lifetime
    of a lease — so independent processes cooperate without a daemon.
    """

    def __init__(self, path: str | Path | None = None) -> None:
        # Default to the shared per-user state file; create it if missing.
        self._path = Path(path) if path else DEFAULT_STATE_PATH
        ensure_state_file(self._path)

    @contextmanager
    def acquire(
        self,
        *,
        priority: int = 0,
        estimated_seconds: int | None = None,
        reclaimable: bool = False,
        label: str | None = None,
        poll_interval: float = 0.25,
        **resources: int,
    ) -> Generator[LeaseHandle, None, None]:
        """Block until *resources* fit, yield a lease handle, release on exit.

        Resource amounts are keyword arguments, e.g. ``vram_mb=4000``.
        """
        handle = self._acquire_blocking(
            resources=resources,
            priority=priority,
            estimated_seconds=estimated_seconds,
            reclaimable=reclaimable,
            label=label,
            poll_interval=poll_interval,
        )
        try:
            yield handle
        finally:
            handle.release()

    async def acquire_async(
        self,
        *,
        priority: int = 0,
        estimated_seconds: int | None = None,
        reclaimable: bool = False,
        label: str | None = None,
        poll_interval: float = 0.25,
        **resources: int,
    ) -> LeaseHandle:
        """Async variant of acquire(); returns the handle, caller must release."""
        queue_id = self._enqueue(resources, priority, label)
        try:
            while True:
                handle = self._try_promote(
                    queue_id, resources, priority, estimated_seconds, reclaimable, label
                )
                if handle is not None:
                    return handle
                await asyncio.sleep(poll_interval)
        except BaseException:
            # Cancellation or error while waiting: leave no stale queue entry.
            self._remove_from_queue(queue_id)
            raise

    def try_acquire(
        self,
        *,
        priority: int = 0,
        estimated_seconds: int | None = None,
        reclaimable: bool = False,
        label: str | None = None,
        **resources: int,
    ) -> LeaseHandle | None:
        """Acquire immediately if the request fits, else return None (no queueing)."""
        pid = os.getpid()
        acquired: list[LeaseHandle] = []

        def _try(state: State) -> None:
            if not state.can_fit(resources):
                return
            lease = Lease(
                pid=pid,
                resources=resources,
                priority=priority,
                estimated_seconds=estimated_seconds,
                reclaimable=reclaimable,
                label=label,
            )
            state.leases.append(lease)
            acquired.append(LeaseHandle(lease, self))

        transact(self._path, _try)
        return acquired[0] if acquired else None

    def status(self) -> PoolStatus:
        """Return a read-only snapshot of capacities, availability, leases, queue."""
        state = read_state(self._path)
        return PoolStatus(
            resources=state.resources,
            available=state.available(),
            leases=state.leases,
            queue=state.queue,
        )

    def _enqueue(self, resources: dict[str, int], priority: int, label: str | None) -> str:
        """Register this process as a waiter; returns the queue entry id."""
        pid = os.getpid()

        def _add(state: State) -> str:
            entry = QueueEntry(pid=pid, resources=resources, priority=priority, label=label)
            state.queue.append(entry)
            return entry.id

        return transact(self._path, _add)

    def _acquire_blocking(
        self,
        resources: dict[str, int],
        priority: int,
        estimated_seconds: int | None,
        reclaimable: bool,
        label: str | None,
        poll_interval: float,
    ) -> LeaseHandle:
        """Synchronous wait loop backing acquire()."""
        queue_id = self._enqueue(resources, priority, label)
        try:
            while True:
                handle = self._try_promote(
                    queue_id, resources, priority, estimated_seconds, reclaimable, label
                )
                if handle is not None:
                    return handle
                time.sleep(poll_interval)
        except BaseException:
            # KeyboardInterrupt etc.: leave no stale queue entry behind.
            self._remove_from_queue(queue_id)
            raise

    def _try_promote(
        self,
        queue_id: str,
        resources: dict[str, int],
        priority: int,
        estimated_seconds: int | None,
        reclaimable: bool,
        label: str | None,
    ) -> LeaseHandle | None:
        """One scheduling attempt: promote our queue entry to a lease if allowed.

        Defers to any other *satisfiable* waiter that outranks us: strictly
        higher priority anywhere in the queue, or equal priority queued
        earlier (FIFO tie-break). If the request cannot fit, reclaimable
        leases that would free enough are flagged with reclaim_requested.
        """
        pid = os.getpid()
        result: list[LeaseHandle] = []

        def _promote(state: State) -> None:
            if not any(e.id == queue_id for e in state.queue):
                return  # entry vanished (e.g. cleaned up); caller keeps polling

            # Yield to waiters that outrank us. Queue order is arrival order,
            # so entries seen before ours arrived earlier. Only waiters whose
            # request could be satisfied right now block us — smaller requests
            # may still backfill around a large request that cannot run yet.
            before_us = True
            for entry in state.queue:
                if entry.id == queue_id:
                    before_us = False
                    continue
                outranks = entry.priority > priority or (
                    entry.priority == priority and before_us
                )
                if outranks and state.can_fit(entry.resources):
                    return

            if state.can_fit(resources):
                lease = Lease(
                    pid=pid,
                    resources=resources,
                    priority=priority,
                    estimated_seconds=estimated_seconds,
                    reclaimable=reclaimable,
                    label=label,
                )
                state.leases.append(lease)
                state.queue = [e for e in state.queue if e.id != queue_id]
                result.append(LeaseHandle(lease, self))
            else:
                # Ask cooperative holders to give their resources back; the
                # lease holders release on their own schedule.
                for lease in state.reclaimable_for(resources):
                    lease.reclaim_requested = True

        transact(self._path, _promote)
        return result[0] if result else None

    def _remove_from_queue(self, queue_id: str) -> None:
        """Drop our queue entry (used when a wait is aborted)."""

        def _remove(state: State) -> None:
            state.queue = [e for e in state.queue if e.id != queue_id]

        transact(self._path, _remove)
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import os
5
+ import tempfile
6
+ from collections.abc import Callable
7
+ from pathlib import Path
8
+ from typing import TypeVar
9
+
10
+ import portalocker
11
+
12
+ from reslock.cleanup import remove_dead_processes
13
+ from reslock.models import State
14
+
15
+ T = TypeVar("T")
16
+
17
+ DEFAULT_STATE_PATH = Path.home() / ".reslock" / "state.json"
18
+
19
+
20
+ def ensure_state_file(path: Path) -> None:
21
+ path.parent.mkdir(parents=True, exist_ok=True)
22
+ if not path.exists():
23
+ path.write_text(State().model_dump_json(indent=2))
24
+
25
+
26
+ def read_state(path: Path) -> State:
27
+ with portalocker.Lock(str(path), "r", timeout=5) as fh:
28
+ data = fh.read()
29
+ return State.model_validate_json(data)
30
+
31
+
32
+ def write_state(path: Path, state: State) -> None:
33
+ fd, tmp = tempfile.mkstemp(dir=path.parent, suffix=".tmp")
34
+ try:
35
+ with os.fdopen(fd, "w") as f:
36
+ f.write(state.model_dump_json(indent=2))
37
+ os.replace(tmp, path)
38
+ except BaseException:
39
+ with contextlib.suppress(OSError):
40
+ os.unlink(tmp)
41
+ raise
42
+
43
+
44
+ def transact(path: Path, fn: Callable[[State], T]) -> T:
45
+ """Atomically read, modify, and write the state file under an exclusive lock.
46
+
47
+ The callable `fn` receives the current state (with dead processes cleaned up)
48
+ and may mutate it. The modified state is written back. The return value of `fn`
49
+ is returned to the caller.
50
+ """
51
+ with portalocker.Lock(str(path), "r+", timeout=5) as fh:
52
+ data = fh.read()
53
+ state = State.model_validate_json(data)
54
+ remove_dead_processes(state)
55
+ result = fn(state)
56
+ new_data = state.model_dump_json(indent=2)
57
+ fh.seek(0)
58
+ fh.truncate()
59
+ fh.write(new_data)
60
+ return result