experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +239 -126
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +217 -50
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +629 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +732 -167
- experimaestro/scheduler/interfaces.py +316 -101
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +171 -117
- experimaestro/scheduler/remote/protocol.py +8 -193
- experimaestro/scheduler/remote/server.py +95 -71
- experimaestro/scheduler/services.py +53 -28
- experimaestro/scheduler/state_provider.py +663 -2430
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +560 -99
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +438 -1966
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -437
- experimaestro/scheduler/state_sync.py +0 -891
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b8.dist-info/RECORD +0 -187
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/tokens.py
CHANGED
|
@@ -3,27 +3,31 @@ a computational resource (e.g. number of launched jobs, etc.)
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
6
8
|
import sys
|
|
7
9
|
from pathlib import Path
|
|
10
|
+
import threading
|
|
8
11
|
import time
|
|
9
|
-
import weakref
|
|
12
|
+
from typing import Dict, Type
|
|
10
13
|
|
|
11
14
|
from omegaconf import DictConfig
|
|
15
|
+
|
|
12
16
|
from experimaestro.core.objects import Config
|
|
13
|
-
import fasteners
|
|
14
|
-
import threading
|
|
15
|
-
import os.path
|
|
16
|
-
from watchdog.events import FileSystemEventHandler
|
|
17
|
-
from typing import Dict
|
|
18
17
|
from experimaestro.launcherfinder.base import TokenConfiguration
|
|
19
|
-
|
|
20
18
|
from experimaestro.launcherfinder.registry import LauncherRegistry
|
|
21
19
|
|
|
22
|
-
from .
|
|
23
|
-
|
|
24
|
-
|
|
20
|
+
from .locking import (
|
|
21
|
+
DynamicDependencyLock,
|
|
22
|
+
DynamicLockFile,
|
|
23
|
+
JobDependencyLock,
|
|
24
|
+
Lock,
|
|
25
|
+
LockError,
|
|
26
|
+
TrackedDynamicResource,
|
|
27
|
+
)
|
|
28
|
+
from .dynamic import DynamicDependency
|
|
29
|
+
from .scheduler.dependencies import Resource
|
|
25
30
|
import logging
|
|
26
|
-
import json
|
|
27
31
|
|
|
28
32
|
|
|
29
33
|
logger = logging.getLogger("xpm.tokens")
|
|
@@ -35,10 +39,45 @@ class Token(Resource):
|
|
|
35
39
|
available: int
|
|
36
40
|
|
|
37
41
|
|
|
38
|
-
|
|
42
|
+
# =============================================================================
|
|
43
|
+
# File-based counter token
|
|
44
|
+
# =============================================================================
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class CounterTokenJobLock(JobDependencyLock):
|
|
48
|
+
"""Job-side lock for counter tokens.
|
|
49
|
+
|
|
50
|
+
Inherits from JobDependencyLock to participate in the dynamic lock lifecycle.
|
|
51
|
+
On release, deletes the token lock file created by the scheduler.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(self, data: dict):
|
|
55
|
+
self.token_path = Path(data["token_path"])
|
|
56
|
+
self.count = data["count"]
|
|
57
|
+
self.name = data["name"]
|
|
58
|
+
# Set lock_file_path for base class release() to delete
|
|
59
|
+
self.lock_file_path = Path(data["lock_file_path"])
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class CounterTokenLock(DynamicDependencyLock):
|
|
63
|
+
"""Scheduler-side lock for counter token dependency.
|
|
64
|
+
|
|
65
|
+
Inherits from DynamicDependencyLock to participate in the dynamic lock lifecycle.
|
|
66
|
+
Manages token acquisition/release through the CounterToken resource.
|
|
67
|
+
|
|
68
|
+
On serialization, passes lock file path to the job process so it can
|
|
69
|
+
delete the lock file on release.
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
dependency: "CounterTokenDependency"
|
|
73
|
+
|
|
39
74
|
def __init__(self, dependency: "CounterTokenDependency"):
|
|
40
|
-
super().__init__()
|
|
41
|
-
|
|
75
|
+
super().__init__(dependency)
|
|
76
|
+
|
|
77
|
+
@property
|
|
78
|
+
def lock_folder(self) -> Path:
|
|
79
|
+
"""Path to the token lock folder."""
|
|
80
|
+
return self.dependency.token.lock_folder
|
|
42
81
|
|
|
43
82
|
def _acquire(self):
|
|
44
83
|
self.dependency.token.acquire(self.dependency)
|
|
@@ -49,6 +88,24 @@ class CounterTokenLock(Lock):
|
|
|
49
88
|
def __str__(self):
|
|
50
89
|
return "Lock(%s)" % self.dependency
|
|
51
90
|
|
|
91
|
+
def to_json(self) -> dict:
|
|
92
|
+
"""Serialize lock for job process."""
|
|
93
|
+
data = super().to_json()
|
|
94
|
+
data.update(
|
|
95
|
+
{
|
|
96
|
+
"token_path": str(self.dependency.token.path),
|
|
97
|
+
"count": self.dependency.count,
|
|
98
|
+
"name": self.dependency.token.name,
|
|
99
|
+
"lock_file_path": str(self.lock_file_path),
|
|
100
|
+
}
|
|
101
|
+
)
|
|
102
|
+
return data
|
|
103
|
+
|
|
104
|
+
@classmethod
|
|
105
|
+
def from_json(cls, data: dict) -> CounterTokenJobLock:
|
|
106
|
+
"""Create job-side lock from serialized data."""
|
|
107
|
+
return CounterTokenJobLock(data)
|
|
108
|
+
|
|
52
109
|
|
|
53
110
|
class CounterTokenDependency(DynamicDependency):
|
|
54
111
|
"""A dependency onto a token (dynamic - availability can change)"""
|
|
@@ -58,201 +115,110 @@ class CounterTokenDependency(DynamicDependency):
|
|
|
58
115
|
self._token = token
|
|
59
116
|
self.count = count
|
|
60
117
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
return f"{self.target.identifier}.token"
|
|
65
|
-
|
|
66
|
-
async def aio_lock(self, timeout: float = 0) -> "Lock":
|
|
67
|
-
"""Acquire lock on token with event-driven waiting
|
|
68
|
-
|
|
69
|
-
Args:
|
|
70
|
-
timeout: Timeout in seconds (0 = wait indefinitely)
|
|
71
|
-
|
|
72
|
-
Returns:
|
|
73
|
-
Lock object
|
|
74
|
-
|
|
75
|
-
Raises:
|
|
76
|
-
LockError: If lock cannot be acquired within timeout
|
|
77
|
-
"""
|
|
78
|
-
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
79
|
-
import time
|
|
80
|
-
|
|
81
|
-
start_time = time.time()
|
|
82
|
-
|
|
83
|
-
while True:
|
|
84
|
-
try:
|
|
85
|
-
lock = CounterTokenLock(self)
|
|
86
|
-
lock.acquire()
|
|
87
|
-
return lock
|
|
88
|
-
except LockError:
|
|
89
|
-
# Wait for token availability notification
|
|
90
|
-
def wait_for_available():
|
|
91
|
-
with self.token.available_condition:
|
|
92
|
-
# Calculate remaining timeout
|
|
93
|
-
if timeout == 0:
|
|
94
|
-
wait_timeout = None # Wait indefinitely
|
|
95
|
-
else:
|
|
96
|
-
elapsed = time.time() - start_time
|
|
97
|
-
if elapsed >= timeout:
|
|
98
|
-
return False # Timeout exceeded
|
|
99
|
-
wait_timeout = timeout - elapsed
|
|
100
|
-
|
|
101
|
-
# Wait for notification
|
|
102
|
-
return self.token.available_condition.wait(timeout=wait_timeout)
|
|
103
|
-
|
|
104
|
-
# Wait in a thread (since condition is threading-based)
|
|
105
|
-
result = await asyncThreadcheck(
|
|
106
|
-
"token availability", wait_for_available
|
|
107
|
-
)
|
|
108
|
-
|
|
109
|
-
# If wait returned False, we timed out
|
|
110
|
-
if result is False:
|
|
111
|
-
raise LockError("Timeout waiting for tokens")
|
|
112
|
-
|
|
113
|
-
# Otherwise, loop back to try acquiring again
|
|
118
|
+
def _create_lock(self) -> "Lock":
|
|
119
|
+
"""Create a counter token lock for this dependency."""
|
|
120
|
+
return CounterTokenLock(self)
|
|
114
121
|
|
|
115
122
|
@property
|
|
116
123
|
def token(self):
|
|
117
124
|
return self._token
|
|
118
125
|
|
|
119
126
|
|
|
120
|
-
class
|
|
121
|
-
"""
|
|
127
|
+
class TokenLockFile(DynamicLockFile):
|
|
128
|
+
"""Lock file for counter tokens.
|
|
122
129
|
|
|
123
|
-
The token file
|
|
124
|
-
|
|
130
|
+
The token file stores JSON with:
|
|
131
|
+
- job_uri: Reference to the job holding the lock
|
|
132
|
+
- information: {"count": number_of_tokens}
|
|
125
133
|
|
|
126
|
-
|
|
127
|
-
|
|
134
|
+
Also supports reading old line-based format for backward compatibility:
|
|
135
|
+
- Line 1: count
|
|
136
|
+
- Line 2: job_uri
|
|
128
137
|
"""
|
|
129
138
|
|
|
139
|
+
count: int
|
|
140
|
+
|
|
130
141
|
def __init__(self, path: Path):
|
|
131
|
-
|
|
142
|
+
"""Load token file from disk, supporting both JSON and old format."""
|
|
143
|
+
self.path = path
|
|
144
|
+
self.job_uri = None
|
|
132
145
|
self.count = 0
|
|
133
|
-
self.uri = None
|
|
134
146
|
|
|
135
147
|
retries = 0
|
|
136
148
|
while retries < 5:
|
|
137
149
|
retries += 1
|
|
138
150
|
try:
|
|
139
|
-
self.path = path
|
|
140
151
|
with path.open("rt") as fp:
|
|
141
|
-
|
|
142
|
-
|
|
152
|
+
content = fp.read().strip()
|
|
153
|
+
if content.startswith("{"):
|
|
154
|
+
# New JSON format
|
|
155
|
+
data = json.loads(content)
|
|
156
|
+
self.job_uri = data.get("job_uri")
|
|
157
|
+
info = data.get("information", {})
|
|
158
|
+
self.count = info.get("count", 0)
|
|
159
|
+
else:
|
|
160
|
+
# Old line-based format: count, uri
|
|
161
|
+
lines = content.split("\n")
|
|
162
|
+
if len(lines) >= 2:
|
|
163
|
+
self.count = int(lines[0])
|
|
164
|
+
self.job_uri = lines[1]
|
|
165
|
+
break
|
|
143
166
|
except FileNotFoundError:
|
|
144
|
-
|
|
145
|
-
self.count = 0
|
|
146
|
-
self.uri = None
|
|
167
|
+
break
|
|
147
168
|
except Exception:
|
|
148
169
|
logging.exception("Error while reading %s", self.path)
|
|
149
170
|
time.sleep(0.1)
|
|
150
171
|
continue
|
|
151
172
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
173
|
+
def from_information(self, info) -> None:
|
|
174
|
+
"""Set count from information dict."""
|
|
175
|
+
if info is None:
|
|
176
|
+
# Creating a new lock file
|
|
177
|
+
self.count = 0
|
|
178
|
+
elif isinstance(info, dict):
|
|
179
|
+
self.count = info.get("count", 0)
|
|
180
|
+
else:
|
|
181
|
+
raise ValueError(f"Invalid information format: {info}")
|
|
159
182
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
self.
|
|
163
|
-
self.path = path
|
|
164
|
-
logging.debug("Writing token file %s", path)
|
|
165
|
-
with path.open("wt") as fp:
|
|
166
|
-
fp.write(f"{str(count)}\n{uri}\n")
|
|
167
|
-
return self
|
|
183
|
+
def to_information(self) -> dict:
|
|
184
|
+
"""Return count for JSON serialization."""
|
|
185
|
+
return {"count": self.count}
|
|
168
186
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
self.path.unlink()
|
|
187
|
+
@classmethod
|
|
188
|
+
def from_dependency(cls, dependency: "CounterTokenDependency") -> "TokenLockFile":
|
|
189
|
+
"""Create a token lock file from a dependency.
|
|
173
190
|
|
|
174
|
-
|
|
175
|
-
"""Watch the matching process"""
|
|
191
|
+
This is a convenience method for testing and backward compatibility.
|
|
176
192
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
return
|
|
193
|
+
Args:
|
|
194
|
+
dependency: The counter token dependency
|
|
180
195
|
|
|
181
|
-
|
|
182
|
-
|
|
196
|
+
Returns:
|
|
197
|
+
New TokenLockFile instance
|
|
198
|
+
"""
|
|
199
|
+
path = (
|
|
200
|
+
dependency._token.path / "tasks" / f"{dependency.target.relmainpath}.json"
|
|
183
201
|
)
|
|
184
|
-
path =
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
# Watch for the job
|
|
189
|
-
def run():
|
|
190
|
-
logger.debug("Locking job lock path %s", lockpath)
|
|
191
|
-
process = None
|
|
192
|
-
# Acquire the job lock - blocks if scheduler is still starting the job
|
|
193
|
-
# Once we get the lock, the job has either started or finished
|
|
194
|
-
with fasteners.InterProcessLock(lockpath):
|
|
195
|
-
if not pidpath.is_file():
|
|
196
|
-
logger.debug("Job already finished (no PID file %s)", pidpath)
|
|
197
|
-
else:
|
|
198
|
-
s = ""
|
|
199
|
-
while s == "":
|
|
200
|
-
s = pidpath.read_text()
|
|
201
|
-
|
|
202
|
-
logger.info("Loading job watcher from definition")
|
|
203
|
-
from experimaestro.connectors import Process
|
|
204
|
-
|
|
205
|
-
# FIXME: not always localhost...
|
|
206
|
-
from experimaestro.connectors.local import LocalConnector
|
|
207
|
-
|
|
208
|
-
connector = LocalConnector.instance()
|
|
209
|
-
process = Process.fromDefinition(connector, json.loads(s))
|
|
210
|
-
|
|
211
|
-
# Wait out of the lock
|
|
212
|
-
if process is not None:
|
|
213
|
-
# Process is None: process has finished
|
|
214
|
-
process.wait()
|
|
215
|
-
|
|
216
|
-
self.delete()
|
|
202
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
203
|
+
job_uri = str(dependency.target.basepath)
|
|
204
|
+
return cls.create(path, job_uri, information={"count": dependency.count})
|
|
217
205
|
|
|
218
|
-
threading.Thread(target=run).start()
|
|
219
206
|
|
|
220
|
-
|
|
221
|
-
class CounterTokenProxy(FileSystemEventHandler):
|
|
222
|
-
"""Hold a weak reference to the counter token to handle gracefully deleted
|
|
223
|
-
counter tokens"""
|
|
224
|
-
|
|
225
|
-
def __init__(self, token: "CounterToken"):
|
|
226
|
-
self._token_ref = weakref.ref(token)
|
|
227
|
-
|
|
228
|
-
def on_modified(self, event):
|
|
229
|
-
token = self._token_ref()
|
|
230
|
-
if token is not None:
|
|
231
|
-
return token.on_modified(event)
|
|
232
|
-
|
|
233
|
-
def on_deleted(self, event):
|
|
234
|
-
token = self._token_ref()
|
|
235
|
-
if token is not None:
|
|
236
|
-
return token.on_deleted(event)
|
|
237
|
-
|
|
238
|
-
def on_created(self, event):
|
|
239
|
-
token = self._token_ref()
|
|
240
|
-
if token is not None:
|
|
241
|
-
return token.on_created(event)
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
class CounterToken(Token, FileSystemEventHandler):
|
|
207
|
+
class CounterToken(Token, TrackedDynamicResource):
|
|
245
208
|
"""File-based counter token
|
|
246
209
|
|
|
247
210
|
To ensure recovery (server stopped for whatever reason), we use one folder
|
|
248
|
-
per token; inside this folder:
|
|
211
|
+
per token; inside this folder (lock_folder):
|
|
249
212
|
|
|
250
|
-
-
|
|
251
|
-
-
|
|
252
|
-
-
|
|
213
|
+
- ipc.lock is used for IPC locking (from TrackedDynamicResource)
|
|
214
|
+
- informations.json contains the maximum number of tokens {"total": count}
|
|
215
|
+
- jobs/{task_id}/{identifier}.json contain job-specific lock info (count, job URI)
|
|
253
216
|
"""
|
|
254
217
|
|
|
255
|
-
|
|
218
|
+
#: Lock file class for token files
|
|
219
|
+
lock_file_class: Type[DynamicLockFile] = TokenLockFile
|
|
220
|
+
|
|
221
|
+
#: Maps token keys to CounterToken instances
|
|
256
222
|
TOKENS: Dict[str, "CounterToken"] = {}
|
|
257
223
|
|
|
258
224
|
@staticmethod
|
|
@@ -276,184 +242,114 @@ class CounterToken(Token, FileSystemEventHandler):
|
|
|
276
242
|
DictConfig({}, key_type=str, element_type=CounterConfiguration),
|
|
277
243
|
)
|
|
278
244
|
|
|
245
|
+
@property
|
|
246
|
+
def lock_folder(self) -> Path:
|
|
247
|
+
"""Path to the lock folder."""
|
|
248
|
+
return self._path
|
|
249
|
+
|
|
250
|
+
@property
|
|
251
|
+
def path(self) -> Path:
|
|
252
|
+
"""Path to the token directory (alias for lock_folder)."""
|
|
253
|
+
return self._path
|
|
254
|
+
|
|
255
|
+
def _write_informations(self, total: int) -> None:
|
|
256
|
+
"""Write token informations to disk."""
|
|
257
|
+
with self.informations_path.open("w") as f:
|
|
258
|
+
json.dump({"total": total}, f)
|
|
259
|
+
|
|
260
|
+
def _read_informations(self) -> int:
|
|
261
|
+
"""Read token total from informations file."""
|
|
262
|
+
try:
|
|
263
|
+
with self.informations_path.open("r") as f:
|
|
264
|
+
data = json.load(f)
|
|
265
|
+
return data.get("total", 0)
|
|
266
|
+
except FileNotFoundError:
|
|
267
|
+
return 0
|
|
268
|
+
|
|
279
269
|
def __init__(self, name: str, path: Path, count: int, force=True):
|
|
280
|
-
"""
|
|
270
|
+
"""Initialize a counter token.
|
|
281
271
|
|
|
282
272
|
Arguments:
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
273
|
+
name: Token name
|
|
274
|
+
path: The file path of the token directory
|
|
275
|
+
count: Number of tokens (overrides previous definitions)
|
|
276
|
+
force: If the token has already been created, force to write the maximum
|
|
277
|
+
number of tokens
|
|
287
278
|
"""
|
|
288
|
-
super().__init__
|
|
279
|
+
# Store path before calling super().__init__ since lock_folder needs it
|
|
280
|
+
self._path = path
|
|
281
|
+
self.total = count
|
|
289
282
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
283
|
+
# Set the informations file if needed (before TrackedDynamicResource init)
|
|
284
|
+
path.mkdir(exist_ok=True, parents=True)
|
|
285
|
+
if force or not (path / "informations.json").is_file():
|
|
286
|
+
with (path / "informations.json").open("w") as f:
|
|
287
|
+
json.dump({"total": count}, f)
|
|
294
288
|
|
|
295
|
-
|
|
289
|
+
# Initialize base classes - this will call _update()
|
|
290
|
+
Token.__init__(self)
|
|
291
|
+
TrackedDynamicResource.__init__(self, name)
|
|
296
292
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
# Condition variable for waiting on token availability
|
|
301
|
-
self.available_condition = threading.Condition(self.lock)
|
|
293
|
+
def __str__(self):
|
|
294
|
+
return "token[{}]".format(self.name)
|
|
302
295
|
|
|
303
|
-
|
|
296
|
+
# --- TrackedDynamicResource abstract method implementations ---
|
|
304
297
|
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
self.total = count
|
|
310
|
-
self.infopath.write_text(str(count))
|
|
298
|
+
def _reset_state(self) -> None:
|
|
299
|
+
"""Reset available count before re-reading lock files."""
|
|
300
|
+
self.total = self._read_informations()
|
|
301
|
+
self.available = self.total
|
|
311
302
|
|
|
312
|
-
|
|
313
|
-
|
|
303
|
+
def _account_lock_file(self, lf: DynamicLockFile) -> None:
|
|
304
|
+
"""Subtract token count from available."""
|
|
305
|
+
self.available -= lf.count
|
|
314
306
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
self.
|
|
318
|
-
self.watcher = ipcom().fswatch(self.proxy, self.path, recursive=True)
|
|
319
|
-
logger.debug("Watching %s", self.watchedpath)
|
|
307
|
+
def _unaccount_lock_file(self, lf: DynamicLockFile) -> None:
|
|
308
|
+
"""Add token count back to available."""
|
|
309
|
+
self.available += lf.count
|
|
320
310
|
|
|
321
|
-
def
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
logging.debug("Removing watcher on %s", self.watchedpath)
|
|
325
|
-
ipcom().fsunwatch(self.watcher)
|
|
326
|
-
self.watcher = None
|
|
311
|
+
def is_available(self, dependency: "CounterTokenDependency") -> bool:
|
|
312
|
+
"""Check if enough tokens are available."""
|
|
313
|
+
return self.available >= dependency.count
|
|
327
314
|
|
|
328
|
-
def
|
|
329
|
-
"""
|
|
315
|
+
def _do_acquire(self, dependency: "CounterTokenDependency") -> None:
|
|
316
|
+
"""Subtract tokens from available count."""
|
|
317
|
+
self.available -= dependency.count
|
|
318
|
+
logger.debug(
|
|
319
|
+
"Token state [acquired %d]: available %d, total %d",
|
|
320
|
+
dependency.count,
|
|
321
|
+
self.available,
|
|
322
|
+
self.total,
|
|
323
|
+
)
|
|
330
324
|
|
|
331
|
-
|
|
332
|
-
"""
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
old_cache = self.cache
|
|
336
|
-
self.cache = {}
|
|
337
|
-
self.available = self.total
|
|
325
|
+
def _do_release(self, dependency: "CounterTokenDependency") -> None:
|
|
326
|
+
"""Add tokens back to available count."""
|
|
327
|
+
self.available += dependency.count
|
|
328
|
+
logging.debug("%s: available %d", self, self.available)
|
|
338
329
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
tf = TokenFile(path)
|
|
343
|
-
tf.watch()
|
|
344
|
-
logging.debug("Read token file %s (%d)", path, tf.count)
|
|
345
|
-
else:
|
|
346
|
-
logging.debug(
|
|
347
|
-
"Token file already in cache %s (%d)", path.name, tf.count
|
|
348
|
-
)
|
|
349
|
-
|
|
350
|
-
self.cache[path.name] = tf
|
|
351
|
-
self.available -= tf.count
|
|
352
|
-
logging.debug("Full token state update finished (%d available)", self.available)
|
|
330
|
+
def _get_lock_file_information(self, dependency: "CounterTokenDependency"):
|
|
331
|
+
"""Return token count for lock file."""
|
|
332
|
+
return {"count": dependency.count}
|
|
353
333
|
|
|
354
|
-
|
|
355
|
-
return "token[{}]".format(self.name)
|
|
334
|
+
# --- Token-specific event handling ---
|
|
356
335
|
|
|
357
|
-
def
|
|
336
|
+
def _handle_information_change(self) -> None:
|
|
337
|
+
"""Handle token count changes from informations.json."""
|
|
338
|
+
total = self._read_informations()
|
|
339
|
+
delta = total - self.total
|
|
340
|
+
self.total = total
|
|
341
|
+
self.available += delta
|
|
358
342
|
logger.debug(
|
|
359
|
-
"
|
|
360
|
-
|
|
361
|
-
self.
|
|
343
|
+
"Token information modified: available %d, total %d",
|
|
344
|
+
self.available,
|
|
345
|
+
self.total,
|
|
362
346
|
)
|
|
363
|
-
name = Path(event.src_path).name
|
|
364
|
-
# Name is in cache if we did not release the token ourselves
|
|
365
|
-
if name in self.cache:
|
|
366
|
-
with self.lock:
|
|
367
|
-
if name in self.cache:
|
|
368
|
-
logging.debug("Deleting %s from token cache (event)", name)
|
|
369
|
-
fc = self.cache[name]
|
|
370
|
-
del self.cache[name]
|
|
371
|
-
|
|
372
|
-
self.available += fc.count
|
|
373
|
-
logger.debug(
|
|
374
|
-
"Getting back %d tokens (%d available)",
|
|
375
|
-
fc.count,
|
|
376
|
-
self.available,
|
|
377
|
-
)
|
|
378
|
-
|
|
379
|
-
# Notify waiting tasks that tokens are available
|
|
380
|
-
self.available_condition.notify_all()
|
|
381
|
-
|
|
382
|
-
def on_created(self, event):
|
|
383
|
-
logger.debug(
|
|
384
|
-
"Created path notification %s [watched %s]",
|
|
385
|
-
event.src_path,
|
|
386
|
-
self.watchedpath,
|
|
387
|
-
)
|
|
388
|
-
|
|
389
|
-
path = Path(event.src_path)
|
|
390
347
|
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
if path.name not in self.cache:
|
|
395
|
-
tokenfile = TokenFile(path)
|
|
396
|
-
tokenfile.watch()
|
|
397
|
-
self.cache[path.name] = tokenfile
|
|
398
|
-
except FileNotFoundError:
|
|
399
|
-
# We did not find the token file... just ignore
|
|
400
|
-
pass
|
|
401
|
-
except Exception:
|
|
402
|
-
logger.exception("Uncaught exception in on_modified handler")
|
|
403
|
-
raise
|
|
348
|
+
# Notify waiting tasks if tokens became available
|
|
349
|
+
if delta > 0:
|
|
350
|
+
self.available_condition.notify_all()
|
|
404
351
|
|
|
405
|
-
|
|
406
|
-
try:
|
|
407
|
-
logger.debug(
|
|
408
|
-
"on modified path: %s [watched %s]",
|
|
409
|
-
event.src_path,
|
|
410
|
-
self.watchedpath,
|
|
411
|
-
)
|
|
412
|
-
# logger.debug("%s", event)
|
|
413
|
-
|
|
414
|
-
path = Path(event.src_path)
|
|
415
|
-
|
|
416
|
-
if event.src_path == str(self.infopath):
|
|
417
|
-
logger.debug("Token information modified")
|
|
418
|
-
with self.lock:
|
|
419
|
-
timestamp = os.path.getmtime(self.infopath)
|
|
420
|
-
if timestamp <= self.timestamp:
|
|
421
|
-
logger.debug(
|
|
422
|
-
"Not reading token file [%f <= %f]",
|
|
423
|
-
timestamp,
|
|
424
|
-
self.timestamp,
|
|
425
|
-
)
|
|
426
|
-
return
|
|
427
|
-
|
|
428
|
-
total = int(self.infopath.read_text())
|
|
429
|
-
delta = total - self.total
|
|
430
|
-
self.total = total
|
|
431
|
-
self.available += delta
|
|
432
|
-
logger.debug(
|
|
433
|
-
"Token information modified: available %d, total %d",
|
|
434
|
-
self.available,
|
|
435
|
-
self.total,
|
|
436
|
-
)
|
|
437
|
-
|
|
438
|
-
# Notify waiting tasks if tokens became available
|
|
439
|
-
if delta > 0:
|
|
440
|
-
self.available_condition.notify_all()
|
|
441
|
-
|
|
442
|
-
# A modified dependency not in cache
|
|
443
|
-
elif path.name.endswith(".token") and path.name not in self.cache:
|
|
444
|
-
with self.lock:
|
|
445
|
-
if path.name not in self.cache:
|
|
446
|
-
logger.debug("Token file not in cache %s", path.name)
|
|
447
|
-
try:
|
|
448
|
-
tokenfile = TokenFile(path)
|
|
449
|
-
tokenfile.watch()
|
|
450
|
-
self.cache[path.name] = tokenfile
|
|
451
|
-
except FileNotFoundError:
|
|
452
|
-
# Well, the file did not exist anymore...
|
|
453
|
-
pass
|
|
454
|
-
except Exception:
|
|
455
|
-
logger.exception("Uncaught exception in on_modified handler")
|
|
456
|
-
raise
|
|
352
|
+
# --- Token API ---
|
|
457
353
|
|
|
458
354
|
def dependency(self, count):
|
|
459
355
|
"""Create a token dependency"""
|
|
@@ -463,51 +359,10 @@ class CounterToken(Token, FileSystemEventHandler):
|
|
|
463
359
|
"""Create a token dependency and add it to the task"""
|
|
464
360
|
return task.add_dependencies(self.dependency(count))
|
|
465
361
|
|
|
466
|
-
def acquire(self, dependency: CounterTokenDependency):
|
|
467
|
-
"""Acquire requested token"""
|
|
468
|
-
with self.lock, self.ipc_lock:
|
|
469
|
-
self._update()
|
|
470
|
-
if self.available < dependency.count:
|
|
471
|
-
logger.debug(
|
|
472
|
-
"Not enough available (%d available, %d requested)",
|
|
473
|
-
self.available,
|
|
474
|
-
dependency.count,
|
|
475
|
-
)
|
|
476
|
-
raise LockError("No token")
|
|
477
|
-
|
|
478
|
-
self.available -= dependency.count
|
|
479
|
-
|
|
480
|
-
self.cache[dependency.name] = TokenFile.create(dependency)
|
|
481
|
-
logger.debug(
|
|
482
|
-
"Token state [acquired %d]: available %d, taken %d",
|
|
483
|
-
dependency.count,
|
|
484
|
-
self.available,
|
|
485
|
-
self.total,
|
|
486
|
-
)
|
|
487
|
-
|
|
488
|
-
def release(self, dependency: CounterTokenDependency):
|
|
489
|
-
"""Release"""
|
|
490
|
-
with self.lock, self.ipc_lock:
|
|
491
|
-
self._update()
|
|
492
|
-
|
|
493
|
-
tf = self.cache.get(dependency.name, None)
|
|
494
|
-
if tf is None:
|
|
495
|
-
logging.error(
|
|
496
|
-
"Could not find the taken token for %s (%s)",
|
|
497
|
-
dependency,
|
|
498
|
-
dependency.name,
|
|
499
|
-
)
|
|
500
|
-
return
|
|
501
|
-
|
|
502
|
-
logging.debug("Deleting %s from token cache", dependency.name)
|
|
503
|
-
del self.cache[dependency.name]
|
|
504
|
-
self.available += tf.count
|
|
505
|
-
logging.debug("%s: available %d", self, self.available)
|
|
506
|
-
|
|
507
|
-
# Notify waiting tasks that tokens are available
|
|
508
|
-
self.available_condition.notify_all()
|
|
509
362
|
|
|
510
|
-
|
|
363
|
+
# =============================================================================
|
|
364
|
+
# Process level token
|
|
365
|
+
# =============================================================================
|
|
511
366
|
|
|
512
367
|
|
|
513
368
|
class ProcessCounterToken(Token):
|