reslock 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reslock-0.1.0/.gitignore +31 -0
- reslock-0.1.0/PKG-INFO +138 -0
- reslock-0.1.0/README.md +110 -0
- reslock-0.1.0/pyproject.toml +83 -0
- reslock-0.1.0/src/reslock/__init__.py +8 -0
- reslock-0.1.0/src/reslock/cleanup.py +20 -0
- reslock-0.1.0/src/reslock/cli.py +313 -0
- reslock-0.1.0/src/reslock/detect.py +27 -0
- reslock-0.1.0/src/reslock/models.py +97 -0
- reslock-0.1.0/src/reslock/pool.py +247 -0
- reslock-0.1.0/src/reslock/state.py +60 -0
reslock-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
.eggs/
|
|
11
|
+
|
|
12
|
+
# Virtual environments
|
|
13
|
+
.venv/
|
|
14
|
+
venv/
|
|
15
|
+
ENV/
|
|
16
|
+
|
|
17
|
+
# IDE
|
|
18
|
+
.idea/
|
|
19
|
+
.vscode/
|
|
20
|
+
*.swp
|
|
21
|
+
|
|
22
|
+
# Testing
|
|
23
|
+
.pytest_cache/
|
|
24
|
+
.coverage
|
|
25
|
+
htmlcov/
|
|
26
|
+
.pyright/
|
|
27
|
+
.ruff_cache/
|
|
28
|
+
|
|
29
|
+
# OS
|
|
30
|
+
.DS_Store
|
|
31
|
+
Thumbs.db
|
reslock-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: reslock
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Resource lock manager for coordinating shared system resources (GPU VRAM, RAM, CPU) across processes
|
|
5
|
+
Project-URL: Homepage, https://github.com/mo22/reslock
|
|
6
|
+
Project-URL: Repository, https://github.com/mo22/reslock
|
|
7
|
+
Author-email: Moritz Möller <mm@mxs.de>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: gpu,lock,resource,scheduling,vram
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: click>=8.0.0
|
|
19
|
+
Requires-Dist: portalocker>=2.0.0
|
|
20
|
+
Requires-Dist: pydantic>=2.0.0
|
|
21
|
+
Requires-Dist: rich>=13.0.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pyright>=1.1.0; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff>=0.4.0; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# reslock
|
|
30
|
+
|
|
31
|
+
Resource lock manager for coordinating shared system resources (GPU VRAM, RAM, CPU cores) across multiple processes on a single machine.
|
|
32
|
+
|
|
33
|
+
## Problem
|
|
34
|
+
|
|
35
|
+
Multiple GPU-consuming processes (llama.cpp, whisper, vLLM, training jobs) compete for limited resources — especially VRAM. Without coordination, they OOM or degrade each other.
|
|
36
|
+
|
|
37
|
+
## How it works
|
|
38
|
+
|
|
39
|
+
- All coordination happens through a single JSON state file — no daemon required
|
|
40
|
+
- Processes coordinate via file locking (held only during reads/writes, not for lease duration)
|
|
41
|
+
- Dead processes are automatically cleaned up via PID checking
|
|
42
|
+
- Priority queue determines which waiter gets resources next
|
|
43
|
+
- Reclaimable leases allow loaded models to be preempted by higher-priority work
|
|
44
|
+
|
|
45
|
+
## Install
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install reslock
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Python API
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from reslock import ResourcePool
|
|
55
|
+
|
|
56
|
+
pool = ResourcePool() # uses ~/.reslock/state.json
|
|
57
|
+
|
|
58
|
+
# Context manager — blocks until resources are available
|
|
59
|
+
with pool.acquire(vram_mb=4000, priority=5, label="whisper") as lease:
|
|
60
|
+
run_whisper(audio_file)
|
|
61
|
+
|
|
62
|
+
# Non-blocking
|
|
63
|
+
lease = pool.try_acquire(vram_mb=4000)
|
|
64
|
+
if lease:
|
|
65
|
+
try:
|
|
66
|
+
do_work()
|
|
67
|
+
finally:
|
|
68
|
+
lease.release()
|
|
69
|
+
|
|
70
|
+
# Async
|
|
71
|
+
async with pool.acquire_async(vram_mb=4000) as lease:
|
|
72
|
+
await run_inference()
|
|
73
|
+
|
|
74
|
+
# Reclaimable lease — can be preempted
|
|
75
|
+
lease = pool.acquire(vram_mb=4000, reclaimable=True)
|
|
76
|
+
load_model()
|
|
77
|
+
# ... later:
|
|
78
|
+
if lease.reclaim_requested:
|
|
79
|
+
unload_model()
|
|
80
|
+
lease.release()
|
|
81
|
+
|
|
82
|
+
# Check status
|
|
83
|
+
status = pool.status()
|
|
84
|
+
print(status.available) # free resources
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## CLI
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Initialize (auto-detects GPU)
|
|
91
|
+
reslock init
|
|
92
|
+
|
|
93
|
+
# Set resources manually
|
|
94
|
+
reslock set vram_mb 24000
|
|
95
|
+
reslock set gpu_slots 2
|
|
96
|
+
|
|
97
|
+
# Show status
|
|
98
|
+
reslock status
|
|
99
|
+
|
|
100
|
+
# Run a command with reserved resources
|
|
101
|
+
reslock run --vram 4G llama-cli --model model.gguf
|
|
102
|
+
reslock run --vram 8G --priority 10 --label "llama-70b" llama-cli ...
|
|
103
|
+
reslock run --vram 4G --ram 16G --cpu 4 python train.py
|
|
104
|
+
|
|
105
|
+
# Manage leases
|
|
106
|
+
reslock list
|
|
107
|
+
reslock release abc-123
|
|
108
|
+
reslock release --label whisper
|
|
109
|
+
reslock reset
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## How resources work
|
|
113
|
+
|
|
114
|
+
Resources are named quantities with a total capacity. Resource names are arbitrary strings — define whatever you need:
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
reslock set vram_mb 24000
|
|
118
|
+
reslock set ram_mb 65536
|
|
119
|
+
reslock set gpu_slots 2
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Leases reserve amounts from these pools. When a lease is released (or its process dies), the resources become available again.
|
|
123
|
+
|
|
124
|
+
## Priority queue
|
|
125
|
+
|
|
126
|
+
When resources aren't immediately available, requests enter a priority queue. Higher priority number = more urgent. Ties are broken by arrival time (FIFO).
|
|
127
|
+
|
|
128
|
+
## Reclaimable leases
|
|
129
|
+
|
|
130
|
+
A process can mark its lease as **reclaimable** — "I'm using this, but can give it up if needed." When a higher-priority request needs those resources, `reclaim_requested` is set to `True`. The lease holder cooperates by releasing.
|
|
131
|
+
|
|
132
|
+
## Development
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
uv venv && uv pip install -e ".[dev]"
|
|
136
|
+
pytest
|
|
137
|
+
ruff check src/ tests/
|
|
138
|
+
```
|
reslock-0.1.0/README.md
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# reslock
|
|
2
|
+
|
|
3
|
+
Resource lock manager for coordinating shared system resources (GPU VRAM, RAM, CPU cores) across multiple processes on a single machine.
|
|
4
|
+
|
|
5
|
+
## Problem
|
|
6
|
+
|
|
7
|
+
Multiple GPU-consuming processes (llama.cpp, whisper, vLLM, training jobs) compete for limited resources — especially VRAM. Without coordination, they OOM or degrade each other.
|
|
8
|
+
|
|
9
|
+
## How it works
|
|
10
|
+
|
|
11
|
+
- All coordination happens through a single JSON state file — no daemon required
|
|
12
|
+
- Processes coordinate via file locking (held only during reads/writes, not for lease duration)
|
|
13
|
+
- Dead processes are automatically cleaned up via PID checking
|
|
14
|
+
- Priority queue determines which waiter gets resources next
|
|
15
|
+
- Reclaimable leases allow loaded models to be preempted by higher-priority work
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install reslock
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Python API
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from reslock import ResourcePool
|
|
27
|
+
|
|
28
|
+
pool = ResourcePool() # uses ~/.reslock/state.json
|
|
29
|
+
|
|
30
|
+
# Context manager — blocks until resources are available
|
|
31
|
+
with pool.acquire(vram_mb=4000, priority=5, label="whisper") as lease:
|
|
32
|
+
run_whisper(audio_file)
|
|
33
|
+
|
|
34
|
+
# Non-blocking
|
|
35
|
+
lease = pool.try_acquire(vram_mb=4000)
|
|
36
|
+
if lease:
|
|
37
|
+
try:
|
|
38
|
+
do_work()
|
|
39
|
+
finally:
|
|
40
|
+
lease.release()
|
|
41
|
+
|
|
42
|
+
# Async
|
|
43
|
+
async with pool.acquire_async(vram_mb=4000) as lease:
|
|
44
|
+
await run_inference()
|
|
45
|
+
|
|
46
|
+
# Reclaimable lease — can be preempted
|
|
47
|
+
lease = pool.acquire(vram_mb=4000, reclaimable=True)
|
|
48
|
+
load_model()
|
|
49
|
+
# ... later:
|
|
50
|
+
if lease.reclaim_requested:
|
|
51
|
+
unload_model()
|
|
52
|
+
lease.release()
|
|
53
|
+
|
|
54
|
+
# Check status
|
|
55
|
+
status = pool.status()
|
|
56
|
+
print(status.available) # free resources
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## CLI
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Initialize (auto-detects GPU)
|
|
63
|
+
reslock init
|
|
64
|
+
|
|
65
|
+
# Set resources manually
|
|
66
|
+
reslock set vram_mb 24000
|
|
67
|
+
reslock set gpu_slots 2
|
|
68
|
+
|
|
69
|
+
# Show status
|
|
70
|
+
reslock status
|
|
71
|
+
|
|
72
|
+
# Run a command with reserved resources
|
|
73
|
+
reslock run --vram 4G llama-cli --model model.gguf
|
|
74
|
+
reslock run --vram 8G --priority 10 --label "llama-70b" llama-cli ...
|
|
75
|
+
reslock run --vram 4G --ram 16G --cpu 4 python train.py
|
|
76
|
+
|
|
77
|
+
# Manage leases
|
|
78
|
+
reslock list
|
|
79
|
+
reslock release abc-123
|
|
80
|
+
reslock release --label whisper
|
|
81
|
+
reslock reset
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## How resources work
|
|
85
|
+
|
|
86
|
+
Resources are named quantities with a total capacity. Resource names are arbitrary strings — define whatever you need:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
reslock set vram_mb 24000
|
|
90
|
+
reslock set ram_mb 65536
|
|
91
|
+
reslock set gpu_slots 2
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Leases reserve amounts from these pools. When a lease is released (or its process dies), the resources become available again.
|
|
95
|
+
|
|
96
|
+
## Priority queue
|
|
97
|
+
|
|
98
|
+
When resources aren't immediately available, requests enter a priority queue. Higher priority number = more urgent. Ties are broken by arrival time (FIFO).
|
|
99
|
+
|
|
100
|
+
## Reclaimable leases
|
|
101
|
+
|
|
102
|
+
A process can mark its lease as **reclaimable** — "I'm using this, but can give it up if needed." When a higher-priority request needs those resources, `reclaim_requested` is set to `True`. The lease holder cooperates by releasing.
|
|
103
|
+
|
|
104
|
+
## Development
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
uv venv && uv pip install -e ".[dev]"
|
|
108
|
+
pytest
|
|
109
|
+
ruff check src/ tests/
|
|
110
|
+
```
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "reslock"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Resource lock manager for coordinating shared system resources (GPU VRAM, RAM, CPU) across processes"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
authors = [{ name = "Moritz Möller", email = "mm@mxs.de" }]
|
|
9
|
+
keywords = ["gpu", "vram", "resource", "lock", "scheduling"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"Intended Audience :: Developers",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"portalocker>=2.0.0",
|
|
21
|
+
"click>=8.0.0",
|
|
22
|
+
"rich>=13.0.0",
|
|
23
|
+
"pydantic>=2.0.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
dev = [
|
|
28
|
+
"pytest>=8.0.0",
|
|
29
|
+
"pytest-asyncio>=0.23.0",
|
|
30
|
+
"ruff>=0.4.0",
|
|
31
|
+
"pyright>=1.1.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.scripts]
|
|
35
|
+
reslock = "reslock.cli:main"
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/mo22/reslock"
|
|
39
|
+
Repository = "https://github.com/mo22/reslock"
|
|
40
|
+
|
|
41
|
+
[build-system]
|
|
42
|
+
requires = ["hatchling"]
|
|
43
|
+
build-backend = "hatchling.build"
|
|
44
|
+
|
|
45
|
+
[tool.hatch.build.targets.wheel]
|
|
46
|
+
packages = ["src/reslock"]
|
|
47
|
+
|
|
48
|
+
[tool.hatch.build.targets.sdist]
|
|
49
|
+
include = ["src/reslock"]
|
|
50
|
+
|
|
51
|
+
[tool.ruff]
|
|
52
|
+
target-version = "py310"
|
|
53
|
+
line-length = 100
|
|
54
|
+
src = ["src"]
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint]
|
|
57
|
+
select = [
|
|
58
|
+
"E", # pycodestyle errors
|
|
59
|
+
"W", # pycodestyle warnings
|
|
60
|
+
"F", # pyflakes
|
|
61
|
+
"I", # isort
|
|
62
|
+
"B", # flake8-bugbear
|
|
63
|
+
"C4", # flake8-comprehensions
|
|
64
|
+
"UP", # pyupgrade
|
|
65
|
+
"ARG", # flake8-unused-arguments
|
|
66
|
+
"SIM", # flake8-simplify
|
|
67
|
+
]
|
|
68
|
+
ignore = [
|
|
69
|
+
"E501", # line too long (handled by formatter)
|
|
70
|
+
"B008", # function calls in argument defaults (needed for Click)
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
[tool.ruff.lint.isort]
|
|
74
|
+
known-first-party = ["reslock"]
|
|
75
|
+
|
|
76
|
+
[tool.pyright]
|
|
77
|
+
pythonVersion = "3.10"
|
|
78
|
+
typeCheckingMode = "strict"
|
|
79
|
+
ignore = ["**/node_modules", "**/__pycache__"]
|
|
80
|
+
|
|
81
|
+
[tool.pytest.ini_options]
|
|
82
|
+
asyncio_mode = "auto"
|
|
83
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""reslock — Resource lock manager for coordinating shared system resources."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from reslock.models import Lease, PoolStatus, QueueEntry, State
|
|
6
|
+
from reslock.pool import LeaseHandle, ResourcePool
|
|
7
|
+
|
|
8
|
+
__all__ = ["Lease", "LeaseHandle", "PoolStatus", "QueueEntry", "ResourcePool", "State"]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from reslock.models import State
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def is_pid_alive(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    Uses the classic "signal 0" probe: ``os.kill(pid, 0)`` performs the
    permission/existence checks without delivering a signal.  A
    ``PermissionError`` means the process exists but is owned by another
    user, so it still counts as alive.
    """
    try:
        os.kill(pid, 0)
        return True
    except PermissionError:
        # Process exists; we just aren't allowed to signal it.
        return True
    except ProcessLookupError:
        return False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def remove_dead_processes(state: State) -> None:
    """Prune leases and queue entries owned by processes that have exited.

    Mutates ``state`` in place; returns None.
    """
    def _alive(pid: int) -> bool:
        # Signal 0 probes existence without delivering anything; EPERM
        # still means the process exists (owned by another user).
        try:
            os.kill(pid, 0)
        except ProcessLookupError:
            return False
        except PermissionError:
            return True
        return True

    state.leases = [lease for lease in state.leases if _alive(lease.pid)]
    state.queue = [entry for entry in state.queue if _alive(entry.pid)]
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"""CLI for reslock."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import signal
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import click
|
|
15
|
+
from rich.console import Console
|
|
16
|
+
from rich.table import Table
|
|
17
|
+
|
|
18
|
+
from reslock.detect import detect_gpu_vram_mb
|
|
19
|
+
from reslock.models import State
|
|
20
|
+
from reslock.pool import ResourcePool
|
|
21
|
+
from reslock.state import DEFAULT_STATE_PATH, ensure_state_file, transact
|
|
22
|
+
|
|
23
|
+
# Shared Rich console used by every CLI command for styled terminal output.
console = Console()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _parse_size(value: str) -> int:
|
|
27
|
+
"""Parse a size string like '4G', '500M', or plain number (MB)."""
|
|
28
|
+
m = re.match(r"^(\d+(?:\.\d+)?)\s*([gGmM]?)$", value)
|
|
29
|
+
if not m:
|
|
30
|
+
raise click.BadParameter(f"Invalid size: {value}")
|
|
31
|
+
num = float(m.group(1))
|
|
32
|
+
unit = m.group(2).upper()
|
|
33
|
+
if unit == "G":
|
|
34
|
+
return int(num * 1024)
|
|
35
|
+
return int(num)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@click.group()
@click.version_option(package_name="reslock")
def main() -> None:
    """Resource lock manager for coordinating shared system resources."""
    # Intentionally empty: this is only the Click group callback.  The
    # docstring above doubles as the CLI's top-level help text, so keep it
    # user-facing when editing.
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@main.command()
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def init(state: Path | None) -> None:
    """Initialize reslock state file, auto-detecting GPU if available."""
    path = state if state is not None else DEFAULT_STATE_PATH
    ensure_state_file(path)

    detected = detect_gpu_vram_mb()

    def _apply(st: State) -> None:
        # Fill in detected capacities without clobbering anything the
        # user configured previously.
        for name, capacity in detected.items():
            st.resources.setdefault(name, capacity)

    transact(path, _apply)

    if detected:
        console.print(f"[green]Detected:[/green] vram_mb={detected.get('vram_mb', 0)}")
    console.print(f"[green]State file:[/green] {path}")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@main.command(name="set")
@click.argument("resource")
@click.argument("value", type=int)
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def set_resource(resource: str, value: int, state: Path | None) -> None:
    """Set a resource capacity (e.g., reslock set vram_mb 24000)."""
    target = state if state is not None else DEFAULT_STATE_PATH
    ensure_state_file(target)

    def _apply(st: State) -> None:
        # Overwrites any existing capacity for this resource name.
        st.resources[resource] = value

    transact(target, _apply)
    console.print(f"[green]Set[/green] {resource} = {value}")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@main.command()
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def status(state: Path | None) -> None:
    """Show current resource status, leases, and queue."""
    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)
    pool = ResourcePool(path)
    st = pool.status()

    # Capacity table: total / used / free per configured resource.
    if st.resources:
        table = Table(title="Resources")
        table.add_column("Resource", style="cyan")
        table.add_column("Total", justify="right")
        table.add_column("Used", justify="right")
        table.add_column("Free", justify="right")
        for key, total in st.resources.items():
            free = st.available.get(key, 0)
            used = total - free
            table.add_row(key, str(total), str(used), str(free))
        console.print(table)
    else:
        console.print("[dim]No resources configured. Run 'reslock init' or 'reslock set'.[/dim]")

    # Active leases, with flags for the cooperative-reclaim protocol.
    if st.leases:
        table = Table(title=f"Leases ({len(st.leases)} active)")
        table.add_column("ID", style="cyan")
        table.add_column("PID")
        table.add_column("Resources")
        table.add_column("Priority", justify="right")
        table.add_column("Label")
        table.add_column("Flags")
        table.add_column("Age")
        now = datetime.now(timezone.utc)
        for lease in st.leases:
            age = now - lease.acquired_at
            mins = int(age.total_seconds()) // 60
            # Switch from seconds to minutes after the first full minute.
            age_str = f"{mins}m ago" if mins > 0 else f"{int(age.total_seconds())}s ago"
            res_str = ", ".join(f"{k}={v}" for k, v in lease.resources.items())
            flags: list[str] = []
            if lease.reclaimable:
                flags.append("reclaimable")
            if lease.reclaim_requested:
                flags.append("reclaim_requested")
            table.add_row(
                lease.id,
                str(lease.pid),
                res_str,
                str(lease.priority),
                lease.label or "",
                " ".join(flags),
                age_str,
            )
        console.print(table)

    # Requests still waiting for capacity to free up.
    if st.queue:
        table = Table(title=f"Queue ({len(st.queue)} waiting)")
        table.add_column("ID", style="cyan")
        table.add_column("PID")
        table.add_column("Resources")
        table.add_column("Priority", justify="right")
        table.add_column("Label")
        table.add_column("Queued")
        now = datetime.now(timezone.utc)
        for e in st.queue:
            age = now - e.queued_at
            secs = int(age.total_seconds())
            queued_str = f"{secs // 60}m ago" if secs >= 60 else f"{secs}s ago"
            res_str = ", ".join(f"{k}={v}" for k, v in e.resources.items())
            table.add_row(e.id, str(e.pid), res_str, str(e.priority), e.label or "", queued_str)
        console.print(table)

    if not st.leases and not st.queue:
        console.print("[dim]No active leases or queued requests.[/dim]")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@main.command()
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def list_(state: Path | None) -> None:
    """List active leases."""
    target = state if state is not None else DEFAULT_STATE_PATH
    ensure_state_file(target)
    snapshot = ResourcePool(target).status()
    if not snapshot.leases:
        console.print("[dim]No active leases.[/dim]")
        return
    # One plain line per lease (unlike `status`, no Rich table here).
    for lease in snapshot.leases:
        res_str = ", ".join(f"{k}={v}" for k, v in lease.resources.items())
        lbl = f" label={lease.label}" if lease.label else ""
        console.print(f" {lease.id} pid={lease.pid} {res_str} prio={lease.priority}{lbl}")
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# Register "list" as the CLI command name.  The function is named `list_`
# to avoid shadowing the `list` builtin; rewriting __name__ keeps help
# output consistent.
# NOTE(review): @main.command() above already registered this callback
# under a name derived from the function name; on click versions that do
# not strip trailing underscores this may leave a second "list-" alias —
# confirm against the pinned click version.
list_.__name__ = "list"
main.add_command(list_, "list")
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@main.command()
@click.argument("lease_id", required=False)
@click.option("--label", "-l", default=None, help="Release by label")
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def release(lease_id: str | None, label: str | None, state: Path | None) -> None:
    """Release a lease by ID or label."""
    if not lease_id and not label:
        raise click.UsageError("Provide a lease ID or --label")
    target = state if state is not None else DEFAULT_STATE_PATH
    ensure_state_file(target)
    released: list[str] = []

    def _drop(st: State) -> None:
        # An explicit lease ID takes precedence; --label is only
        # consulted when no ID was given.
        kept = []
        for ls in st.leases:
            matched = (ls.id == lease_id) if lease_id else (ls.label == label)
            if matched:
                released.append(ls.id)
            else:
                kept.append(ls)
        st.leases = kept

    transact(target, _drop)
    if released:
        console.print(f"[green]Released {len(released)} lease(s).[/green]")
    else:
        console.print("[yellow]No matching lease found.[/yellow]")
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@main.command()
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
def reset(state: Path | None) -> None:
    """Clear all state (leases, queue)."""
    target = state if state is not None else DEFAULT_STATE_PATH
    ensure_state_file(target)

    def _wipe(st: State) -> None:
        # Drop every lease and queued request; configured capacities
        # (st.resources) are deliberately kept.
        st.leases.clear()
        st.queue.clear()

    transact(target, _wipe)
    console.print("[green]State cleared.[/green]")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@main.command()
def schema() -> None:
    """Print the JSON schema for the state file."""
    # Deferred import: only this command needs json.
    import json

    doc = State.model_json_schema()
    click.echo(json.dumps(doc, indent=2))
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
@main.command()
@click.option("--vram", default=None, help="VRAM to reserve (e.g., 4G, 500M)")
@click.option("--ram", default=None, help="RAM to reserve (e.g., 16G)")
@click.option("--cpu", type=int, default=None, help="CPU cores to reserve")
@click.option("--priority", "-p", type=int, default=0, help="Priority (higher = more urgent)")
@click.option("--label", "-l", default=None, help="Label for this lease")
@click.option(
    "--reclaimable", is_flag=True, help="Allow lease to be reclaimed by higher-priority requests"
)
@click.option(
    "--reclaim-signal",
    default="SIGTERM",
    help="Signal to send to the child process when reclaim is requested (default: SIGTERM)",
)
@click.option("--state", "-s", type=click.Path(path_type=Path), default=None)
@click.argument("command", nargs=-1, required=True)
def run(
    vram: str | None,
    ram: str | None,
    cpu: int | None,
    priority: int,
    label: str | None,
    reclaimable: bool,
    reclaim_signal: str,
    state: Path | None,
    command: tuple[str, ...],
) -> None:
    """Reserve resources and run a command.

    With --reclaimable, the lease can be reclaimed by higher-priority requests.
    When reclaim is requested, the specified signal is sent to the child process.
    """
    # Translate CLI options into the generic resource dict used by the pool.
    resources: dict[str, int] = {}
    if vram:
        resources["vram_mb"] = _parse_size(vram)
    if ram:
        resources["ram_mb"] = _parse_size(ram)
    if cpu:
        resources["cpu_cores"] = cpu

    if not resources:
        raise click.UsageError("Specify at least one resource (--vram, --ram, --cpu)")

    # NOTE(review): getattr fetches ANY attribute of the signal module, not
    # just signal numbers (e.g. "--reclaim-signal pause" would pass this
    # check) — consider validating against signal.Signals; confirm intent.
    sig = getattr(signal, reclaim_signal, None)
    if sig is None:
        raise click.BadParameter(f"Unknown signal: {reclaim_signal}")

    path = state or DEFAULT_STATE_PATH
    ensure_state_file(path)
    pool = ResourcePool(path)

    console.print(f"[dim]Waiting for resources: {resources}...[/dim]")
    # acquire() blocks until capacity is free; leaving the `with` block
    # releases the lease.
    with pool.acquire(
        priority=priority, label=label, reclaimable=reclaimable, **resources
    ) as lease:
        console.print(f"[green]Acquired lease {lease.id}[/green]")
        proc = subprocess.Popen(list(command))

        # Forward SIGTERM/SIGINT to the child so killing `reslock run`
        # also terminates (and thereby releases) the wrapped process.
        def _sighandler(signum: int, _frame: object) -> None:
            proc.send_signal(signum)

        signal.signal(signal.SIGTERM, _sighandler)
        signal.signal(signal.SIGINT, _sighandler)

        if reclaimable:

            # Background poller: when another process requests reclaim of
            # this lease, deliver the configured signal to the child once
            # and stop watching.
            def _watch_reclaim() -> None:
                while proc.poll() is None:
                    if lease.reclaim_requested:
                        console.print(
                            f"[yellow]Reclaim requested, sending {reclaim_signal} to child[/yellow]"
                        )
                        proc.send_signal(sig)
                        return
                    time.sleep(0.5)

            threading.Thread(target=_watch_reclaim, daemon=True).start()

        returncode = proc.wait()

    # Lease has been released by the context manager at this point.
    console.print(f"[dim]Lease released. Process exited with code {returncode}.[/dim]")
    sys.exit(returncode)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# Allow `python -m reslock.cli` / direct execution in addition to the
# `reslock` console-script entry point.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def detect_gpu_vram_mb() -> dict[str, int]:
    """Detect total GPU VRAM via nvidia-smi. Returns empty dict if not available."""
    if shutil.which("nvidia-smi") is None:
        return {}
    query = ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"]
    try:
        proc = subprocess.run(query, capture_output=True, text=True, timeout=10)
        if proc.returncode == 0:
            # One line of output per GPU; sum them into a single capacity.
            total = sum(int(line.strip()) for line in proc.stdout.strip().splitlines())
            if total > 0:
                return {"vram_mb": total}
    except (subprocess.TimeoutExpired, ValueError, OSError):
        # Best-effort detection: any failure just means "no GPU found".
        pass
    return {}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from uuid import uuid4
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _utcnow() -> datetime:
|
|
10
|
+
return datetime.now(timezone.utc)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _new_id() -> str:
|
|
14
|
+
return uuid4().hex[:12]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Lease(BaseModel):
    # An active reservation of resource amounts held by one process.
    # (Documented with comments rather than a docstring so the JSON schema
    # printed by `reslock schema` is unchanged.)
    id: str = Field(default_factory=_new_id)  # short random identifier
    pid: int  # owning process; dead holders are pruned by cleanup
    resources: dict[str, int]  # reserved amount per resource name
    priority: int = 0  # higher = more urgent
    acquired_at: datetime = Field(default_factory=_utcnow)  # UTC timestamp
    estimated_seconds: int | None = None  # optional expected-runtime hint
    reclaimable: bool = False  # holder agrees to give the lease up if asked
    reclaim_requested: bool = False  # set when a higher-priority request needs it
    label: str | None = None  # optional human-readable tag

    # Reject unknown keys when parsing the state file.
    model_config = {"extra": "forbid"}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class QueueEntry(BaseModel):
    # A pending resource request waiting for capacity to free up.
    # (Documented with comments rather than a docstring so the JSON schema
    # printed by `reslock schema` is unchanged.)
    id: str = Field(default_factory=_new_id)  # short random identifier
    pid: int  # requesting process; dead waiters are pruned by cleanup
    resources: dict[str, int]  # requested amount per resource name
    priority: int = 0  # higher = served sooner; ties break FIFO by queued_at
    queued_at: datetime = Field(default_factory=_utcnow)  # UTC enqueue time
    label: str | None = None  # optional human-readable tag

    # Reject unknown keys when parsing the state file.
    model_config = {"extra": "forbid"}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class State(BaseModel):
    # The whole coordination state, persisted as a single JSON file.
    # (Class-level comments instead of a docstring keep `reslock schema`
    # output unchanged.)
    version: int = 1  # state-file format version
    resources: dict[str, int] = Field(default_factory=dict)  # total capacity per resource
    leases: list[Lease] = Field(default_factory=list)  # active reservations
    queue: list[QueueEntry] = Field(default_factory=list)  # waiting requests

    model_config = {"extra": "forbid"}

    def available(self) -> dict[str, int]:
        """Return free capacity per configured resource (total minus leased).

        Leased amounts for names not present in ``self.resources`` are
        ignored; values can go negative if capacity was lowered below
        current usage.
        """
        used: dict[str, int] = {}
        for lease in self.leases:
            for key, val in lease.resources.items():
                used[key] = used.get(key, 0) + val
        return {key: total - used.get(key, 0) for key, total in self.resources.items()}

    def can_fit(self, resources: dict[str, int]) -> bool:
        """True if the requested amounts all fit into currently free capacity.

        A resource name not configured in the pool has 0 available, so any
        positive request for it fails.
        """
        avail = self.available()
        return all(avail.get(key, 0) >= val for key, val in resources.items())

    def reclaimable_for(self, resources: dict[str, int]) -> list[Lease]:
        """Return reclaimable leases that would need to be reclaimed to fit the request.

        Returns [] both when nothing needs reclaiming (the request already
        fits) and when reclaiming every candidate still could not cover the
        shortfall — callers distinguish the two via can_fit() if needed.
        """
        avail = self.available()
        # Per-resource deficit between the request and free capacity.
        shortfall: dict[str, int] = {}
        for key, val in resources.items():
            deficit = val - avail.get(key, 0)
            if deficit > 0:
                shortfall[key] = deficit
        if not shortfall:
            return []
        # Only leases that opted in and have not been asked already.
        candidates = [ls for ls in self.leases if ls.reclaimable and not ls.reclaim_requested]
        # Greedy selection: sacrifice lowest-priority leases first.
        candidates.sort(key=lambda ls: ls.priority)
        result: list[Lease] = []
        remaining = dict(shortfall)
        for lease in candidates:
            if not remaining:
                break
            helps = False
            for key in list(remaining):
                contrib = lease.resources.get(key, 0)
                if contrib > 0:
                    helps = True
                    remaining[key] -= contrib
                    if remaining[key] <= 0:
                        del remaining[key]
            # Skip leases that contribute nothing toward the shortfall.
            if helps:
                result.append(lease)
        if remaining:
            # Even reclaiming every candidate would not free enough.
            return []
        return result
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class PoolStatus(BaseModel):
    # Read-only snapshot of pool state (returned by ResourcePool.status()).
    resources: dict[str, int]  # configured total capacity per resource
    available: dict[str, int]  # free capacity per resource at snapshot time
    leases: list[Lease]  # active leases at snapshot time
    queue: list[QueueEntry]  # waiting requests at snapshot time
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import Generator
|
|
7
|
+
from contextlib import contextmanager
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from reslock.models import Lease, PoolStatus, QueueEntry, State
|
|
11
|
+
from reslock.state import DEFAULT_STATE_PATH, ensure_state_file, read_state, transact
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LeaseHandle:
    """Handle for an acquired lease, used to update or release it."""

    def __init__(self, lease: Lease, pool: ResourcePool) -> None:
        self._lease = lease
        self._pool = pool
        self._released = False

    @property
    def id(self) -> str:
        """Unique identifier of the underlying lease."""
        return self._lease.id

    @property
    def reclaim_requested(self) -> bool:
        """Check the state file for whether reclaim has been requested.

        Re-reads the shared state on every access so callers can poll it.
        A lease that has vanished from the state file is reported as
        reclaimed.
        """
        state = read_state(self._pool._path)
        for lease in state.leases:
            if lease.id == self._lease.id:
                self._lease = lease  # keep our local copy in sync
                return lease.reclaim_requested
        return True  # lease gone = treat as reclaimed

    def wait_for_reclaim(self, poll_interval: float = 0.5) -> None:
        """Block until reclaim is requested for this lease."""
        while not self.reclaim_requested:
            time.sleep(poll_interval)

    async def wait_for_reclaim_async(self, poll_interval: float = 0.5) -> None:
        """Async wait until reclaim is requested for this lease."""
        while not self.reclaim_requested:
            await asyncio.sleep(poll_interval)

    def update(self, estimated_seconds: int | None = None) -> None:
        """Update mutable lease fields (currently only `estimated_seconds`).

        Silently a no-op if the lease no longer exists in the state file.
        """

        def _update(state: State) -> None:
            for lease in state.leases:
                if lease.id == self._lease.id:
                    if estimated_seconds is not None:
                        lease.estimated_seconds = estimated_seconds
                    break

        transact(self._pool._path, _update)

    def release(self) -> None:
        """Remove the lease from the state file. Safe to call more than once."""
        if self._released:
            return

        def _release(state: State) -> None:
            state.leases = [ls for ls in state.leases if ls.id != self._lease.id]

        transact(self._pool._path, _release)
        # Mark released only after the transaction succeeds, so a failed
        # write does not leave the handle believing the lease is gone.
        self._released = True
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class ResourcePool:
    """Client API for acquiring leases from the shared, file-backed pool."""

    def __init__(self, path: str | Path | None = None) -> None:
        # Fall back to the per-user default state file location.
        self._path = Path(path) if path else DEFAULT_STATE_PATH
        ensure_state_file(self._path)

    @contextmanager
    def acquire(
        self,
        *,
        priority: int = 0,
        estimated_seconds: int | None = None,
        reclaimable: bool = False,
        label: str | None = None,
        poll_interval: float = 0.25,
        **resources: int,
    ) -> Generator[LeaseHandle, None, None]:
        """Block until `resources` can be leased; release the lease on exit."""
        handle = self._acquire_blocking(
            resources=resources,
            priority=priority,
            estimated_seconds=estimated_seconds,
            reclaimable=reclaimable,
            label=label,
            poll_interval=poll_interval,
        )
        try:
            yield handle
        finally:
            handle.release()

    async def acquire_async(
        self,
        *,
        priority: int = 0,
        estimated_seconds: int | None = None,
        reclaimable: bool = False,
        label: str | None = None,
        poll_interval: float = 0.25,
        **resources: int,
    ) -> LeaseHandle:
        """Async variant of acquire(); the caller must release() the handle."""
        queue_id = self._enqueue(resources, priority, label)
        try:
            while True:
                handle = self._try_promote(
                    queue_id, resources, priority, estimated_seconds, reclaimable, label
                )
                if handle is not None:
                    return handle
                await asyncio.sleep(poll_interval)
        except BaseException:
            # Includes cancellation: never leave a stale queue entry behind.
            self._remove_from_queue(queue_id)
            raise

    def try_acquire(
        self,
        *,
        priority: int = 0,
        estimated_seconds: int | None = None,
        reclaimable: bool = False,
        label: str | None = None,
        **resources: int,
    ) -> LeaseHandle | None:
        """Acquire immediately if the request fits, else return None (no queuing)."""
        pid = os.getpid()
        result: list[LeaseHandle] = []

        def _try(state: State) -> None:
            if not state.can_fit(resources):
                return
            lease = Lease(
                pid=pid,
                resources=resources,
                priority=priority,
                estimated_seconds=estimated_seconds,
                reclaimable=reclaimable,
                label=label,
            )
            state.leases.append(lease)
            result.append(LeaseHandle(lease, self))

        transact(self._path, _try)
        return result[0] if result else None

    def status(self) -> PoolStatus:
        """Return a snapshot of totals, availability, leases and queue."""
        state = read_state(self._path)
        return PoolStatus(
            resources=state.resources,
            available=state.available(),
            leases=state.leases,
            queue=state.queue,
        )

    def _enqueue(self, resources: dict[str, int], priority: int, label: str | None) -> str:
        """Append a queue entry for this process and return its id."""
        pid = os.getpid()

        def _add(state: State) -> str:
            entry = QueueEntry(pid=pid, resources=resources, priority=priority, label=label)
            state.queue.append(entry)
            return entry.id

        return transact(self._path, _add)

    def _acquire_blocking(
        self,
        resources: dict[str, int],
        priority: int,
        estimated_seconds: int | None,
        reclaimable: bool,
        label: str | None,
        poll_interval: float,
    ) -> LeaseHandle:
        """Synchronous wait loop: enqueue, then poll until promoted to a lease."""
        queue_id = self._enqueue(resources, priority, label)
        try:
            while True:
                handle = self._try_promote(
                    queue_id, resources, priority, estimated_seconds, reclaimable, label
                )
                if handle is not None:
                    return handle
                time.sleep(poll_interval)
        except BaseException:
            # KeyboardInterrupt etc.: never leave a stale queue entry behind.
            self._remove_from_queue(queue_id)
            raise

    def _try_promote(
        self,
        queue_id: str,
        resources: dict[str, int],
        priority: int,
        estimated_seconds: int | None,
        reclaimable: bool,
        label: str | None,
    ) -> LeaseHandle | None:
        """One scheduling attempt: grant a lease, or request reclaims, or do nothing."""
        pid = os.getpid()
        result: list[LeaseHandle] = []

        def _promote(state: State) -> None:
            # Check if we're still in queue (we may have been cleaned up).
            in_queue = any(e.id == queue_id for e in state.queue)
            if not in_queue:
                return

            # Yield to any higher-priority waiter ahead of us that could run now.
            for entry in state.queue:
                if entry.id == queue_id:
                    break
                if entry.priority > priority and state.can_fit(entry.resources):
                    return  # higher priority waiter can fit, let them go first

            if state.can_fit(resources):
                lease = Lease(
                    pid=pid,
                    resources=resources,
                    priority=priority,
                    estimated_seconds=estimated_seconds,
                    reclaimable=reclaimable,
                    label=label,
                )
                state.leases.append(lease)
                state.queue = [e for e in state.queue if e.id != queue_id]
                result.append(LeaseHandle(lease, self))
            else:
                # Ask reclaimable leases to give way; we'll retry on a later poll.
                for lease in state.reclaimable_for(resources):
                    lease.reclaim_requested = True

        transact(self._path, _promote)
        return result[0] if result else None

    def _remove_from_queue(self, queue_id: str) -> None:
        """Drop this waiter's queue entry (used on error/cancellation)."""

        def _remove(state: State) -> None:
            state.queue = [e for e in state.queue if e.id != queue_id]

        transact(self._path, _remove)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TypeVar
|
|
9
|
+
|
|
10
|
+
import portalocker
|
|
11
|
+
|
|
12
|
+
from reslock.cleanup import remove_dead_processes
|
|
13
|
+
from reslock.models import State
|
|
14
|
+
|
|
15
|
+
# Return type of the callable passed to transact().
T = TypeVar("T")


# Per-user default location of the shared state file.
DEFAULT_STATE_PATH = Path.home() / ".reslock" / "state.json"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def ensure_state_file(path: Path) -> None:
    """Create the state file (and its parent directory) if it does not exist.

    Uses exclusive-create mode ("x") so that two processes racing to
    initialize the pool cannot both pass an exists() check and then
    clobber each other's freshly written state.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(path, "x") as f:
            f.write(State().model_dump_json(indent=2))
    except FileExistsError:
        pass  # another process initialized it first; keep its contents
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def read_state(path: Path) -> State:
    """Parse the state file while holding its file lock."""
    with portalocker.Lock(str(path), "r", timeout=5) as fh:
        raw = fh.read()
    return State.model_validate_json(raw)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def write_state(path: Path, state: State) -> None:
    """Atomically replace the state file via a temp file plus rename."""
    handle, tmp_name = tempfile.mkstemp(dir=path.parent, suffix=".tmp")
    try:
        stream = os.fdopen(handle, "w")
        with stream:
            stream.write(state.model_dump_json(indent=2))
        os.replace(tmp_name, path)
    except BaseException:
        # Best-effort cleanup of the orphaned temp file before re-raising.
        with contextlib.suppress(OSError):
            os.unlink(tmp_name)
        raise
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def transact(path: Path, fn: Callable[[State], T]) -> T:
    """Atomically read, modify, and write the state file under an exclusive lock.

    The callable `fn` receives the current state (with dead processes cleaned up)
    and may mutate it. The modified state is written back. The return value of `fn`
    is returned to the caller.
    """
    # NOTE: the file is rewritten in place (seek/truncate) rather than via
    # os.replace, so the locked file descriptor stays valid for the whole
    # read-modify-write cycle.
    with portalocker.Lock(str(path), "r+", timeout=5) as fh:
        data = fh.read()
        state = State.model_validate_json(data)
        # Drop leases/queue entries whose owning processes no longer exist.
        remove_dead_processes(state)
        result = fn(state)
        new_data = state.model_dump_json(indent=2)
        # Rewind and truncate so shorter output leaves no stale tail bytes.
        fh.seek(0)
        fh.truncate()
        fh.write(new_data)
        return result
|