experimaestro-2.0.0b8-py3-none-any.whl → experimaestro-2.0.0b17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +239 -126
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +217 -50
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +629 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +732 -167
- experimaestro/scheduler/interfaces.py +316 -101
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +171 -117
- experimaestro/scheduler/remote/protocol.py +8 -193
- experimaestro/scheduler/remote/server.py +95 -71
- experimaestro/scheduler/services.py +53 -28
- experimaestro/scheduler/state_provider.py +663 -2430
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +560 -99
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +438 -1966
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -437
- experimaestro/scheduler/state_sync.py +0 -891
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b8.dist-info/RECORD +0 -187
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/cli/filter.py
CHANGED

```diff
@@ -5,12 +5,27 @@ tags, and other attributes.
 """
 
 import re
-from
+from dataclasses import dataclass, field
+from typing import Callable, Dict, TYPE_CHECKING
 import pyparsing as pp
 
 if TYPE_CHECKING:
     from experimaestro.scheduler.state_provider import MockJob
 
+# Type alias for tags map: job_id -> {tag_key: tag_value}
+TagsMap = Dict[str, Dict[str, str]]
+
+
+@dataclass
+class FilterContext:
+    """Context for filter evaluation containing experiment-scoped data
+
+    Attributes:
+        tags_map: Maps job identifiers to their tags dict for the current experiment/run
+    """
+
+    tags_map: TagsMap = field(default_factory=dict)
+
 
 # --- classes for processing
 
@@ -19,14 +34,16 @@ class VarExpr:
     def __init__(self, values):
         (self.varname,) = values
 
-    def get(self, job: "MockJob"):
+    def get(self, job: "MockJob", ctx: FilterContext):
         if self.varname == "@state":
             return job.state.name if job.state else None
 
         if self.varname == "@name":
             return str(job.path.parent.name)
 
-
+        # Tags are stored in JobTagModel, accessed via ctx.tags_map keyed by job identifier
+        job_tags = ctx.tags_map.get(job.identifier, {})
+        return job_tags.get(self.varname, None)
 
     def __repr__(self):
         return f"""VAR<{self.varname}>"""
@@ -39,8 +56,8 @@ class BaseInExpr:
 
 
 class InExpr(BaseInExpr):
-    def filter(self, job: "MockJob"):
-        value = self.var.get(job)
+    def filter(self, job: "MockJob", ctx: FilterContext):
+        value = self.var.get(job, ctx)
         return value in self.values
 
     def __repr__(self):
@@ -48,8 +65,8 @@ class InExpr(BaseInExpr):
 
 
 class NotInExpr(BaseInExpr):
-    def filter(self, job: "MockJob"):
-        value = self.var.get(job)
+    def filter(self, job: "MockJob", ctx: FilterContext):
+        value = self.var.get(job, ctx)
         return value not in self.values
 
     def __repr__(self):
@@ -70,8 +87,8 @@ class RegexExpr:
 
         raise AssertionError()
 
-    def filter(self, job: "MockJob"):
-        value = self.var.get(job)
+    def filter(self, job: "MockJob", ctx: FilterContext):
+        value = self.var.get(job, ctx)
         if not value:
             return False
 
@@ -82,7 +99,7 @@ class ConstantString:
     def __init__(self, tokens):
         (self.value,) = tokens
 
-    def get(self, _job: "MockJob"):
+    def get(self, _job: "MockJob", _ctx: FilterContext):
         return self.value
 
     def __repr__(self):
@@ -96,8 +113,8 @@ class EqExpr:
     def __repr__(self):
         return f"""EQ[{self.var1}, {self.var2}]"""
 
-    def filter(self, job: "MockJob"):
-        return self.var1.get(job) == self.var2.get(job)
+    def filter(self, job: "MockJob", ctx: FilterContext):
+        return self.var1.get(job, ctx) == self.var2.get(job, ctx)
 
 
 class LogicExpr:
@@ -107,11 +124,11 @@ class LogicExpr:
         self.operator, self.y = tokens
         self.x = None
 
-    def filter(self, job: "MockJob"):
+    def filter(self, job: "MockJob", ctx: FilterContext):
         if self.operator == "and":
-            return self.y.filter(job) and self.x.filter(job)
+            return self.y.filter(job, ctx) and self.x.filter(job, ctx)
 
-        return self.y.filter(job) or self.x.filter(job)
+        return self.y.filter(job, ctx) or self.x.filter(job, ctx)
 
     @staticmethod
     def summary(tokens):
@@ -138,7 +155,7 @@ class LogicExpr:
 
 # --- Grammar
 
-
+lit = pp.Literal
 
 lpar, rpar, lbra, rbra, eq, comma, pipe, tilde = map(pp.Suppress, "()[]=,|~")
 quotedString = pp.QuotedString('"', unquoteResults=True) | pp.QuotedString(
@@ -148,7 +165,7 @@ quotedString = pp.QuotedString('"', unquoteResults=True) | pp.QuotedString(
 # Tag names can contain letters, digits, underscores, and hyphens
 # First character must be a letter, rest can include digits, underscores, hyphens
 tag_name = pp.Word(pp.alphas, pp.alphanums + "_-")
-var =
+var = lit("@state") | lit("@name") | tag_name
 var.setParseAction(VarExpr)
 
 regexExpr = var + tilde + quotedString
@@ -161,15 +178,15 @@ eqExpr.setParseAction(EqExpr)
 
 stringList = quotedString + pp.ZeroOrMore(comma + quotedString)
 
-notInExpr = var + (pp.Suppress(
+notInExpr = var + (pp.Suppress(lit("not in")) + lbra + stringList + rbra)
 notInExpr.setParseAction(NotInExpr)
 
-inExpr = var + (pp.Suppress(
+inExpr = var + (pp.Suppress(lit("in")) + lbra + stringList + rbra)
 inExpr.setParseAction(InExpr)
 
 matchExpr = eqExpr | regexExpr | inExpr | notInExpr
 
-booleanOp =
+booleanOp = lit("and") | lit("or")
 logicExpr = (
     matchExpr + pp.ZeroOrMore((booleanOp + matchExpr).setParseAction(LogicExpr))
 ).setParseAction(LogicExpr.summary)
@@ -181,14 +198,22 @@ filterExpr = (
 expr = (matchExpr + pp.Optional(pipe + filterExpr)).setParseAction(LogicExpr.generator)
 
 
-def createFilter(query: str) -> Callable[["MockJob"], bool]:
+def createFilter(query: str, ctx: FilterContext = None) -> Callable[["MockJob"], bool]:
     """Returns a filter function given a query string
 
     Args:
         query: Filter expression (e.g., '@state = "DONE" and model = "bm25"')
+        ctx: FilterContext containing tags map and other experiment-scoped data.
+            If None, an empty context is used.
 
     Returns:
-        A callable that takes a MockJob and returns True if it matches
+        A callable that takes a MockJob and returns True if it matches.
    """
+    if ctx is None:
+        ctx = FilterContext()
     (r,) = logicExpr.parseString(query, parseAll=True)
-
+
+    def filter_fn(job: "MockJob") -> bool:
+        return r.filter(job, ctx)
+
+    return filter_fn
```
experimaestro/cli/jobs.py
CHANGED

```diff
@@ -65,90 +65,91 @@ def process(
         fullpath: Show full paths instead of short names
         count: Limit output to N most recent jobs (0 = no limit)
     """
-    from .filter import createFilter
-    from experimaestro.scheduler.
+    from .filter import createFilter, FilterContext
+    from experimaestro.scheduler.workspace_state_provider import WorkspaceStateProvider
     from experimaestro.scheduler import JobState
 
-
+    # Get state provider (read-only monitoring)
+    provider = WorkspaceStateProvider.get_instance(workspace.path)
 
-    # Get
-
-
-
-    try:
-        # Get all jobs from the database
+    # Get jobs from the database, optionally filtered by experiment
+    if experiment:
+        all_jobs = provider.get_jobs(experiment_id=experiment)
+    else:
         all_jobs = provider.get_all_jobs()
 
-
-
-
+    # Load tags map for the experiment (if specified)
+    tags_map = {}
+    if experiment:
+        tags_map = provider.get_tags_map(experiment_id=experiment)
 
-
-
-        all_jobs = [j for j in all_jobs if _filter(j)]
+    # Create filter context with tags map
+    filter_ctx = FilterContext(tags_map=tags_map)
 
-
-
-        all_jobs.sort(key=lambda j: j.submittime or 0, reverse=True)
+    # Create filter function with context
+    _filter = createFilter(filter, filter_ctx) if filter else None
 
-
-
-
+    # Apply filter expression
+    if _filter:
+        all_jobs = [j for j in all_jobs if _filter(j)]
 
-
-
-
+    # Sort by submission time (most recent first)
+    # Jobs without submittime go to the end
+    all_jobs.sort(key=lambda j: j.submittime or 0, reverse=True)
+
+    # Limit to N most recent jobs if count is specified
+    if count > 0:
+        all_jobs = all_jobs[:count]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            else:
-                cprint(f"KILL FAILED {job_str}", "red")
+    if not all_jobs:
+        cprint("No jobs found.", "yellow")
+        return
+
+    # Process each job
+    for job in all_jobs:
+        job_str = str(job.path) if fullpath else f"{job.task_id}/{job.identifier}"
+
+        if job.state is None or job.state == JobState.UNSCHEDULED:
+            print(colored(f"UNSCHED {job_str}", "red"), end="")
+        elif job.state.running():
+            if kill:
+                if perform:
+                    if provider.kill_job(job, perform=True):
+                        cprint(f"KILLED {job_str}", "light_red")
                     else:
-                        cprint(f"
+                        cprint(f"KILL FAILED {job_str}", "red")
                 else:
-
-        elif job.state == JobState.DONE:
-            print(colored(f"DONE {job_str}", "green"), end="")
-        elif job.state == JobState.ERROR:
-            print(colored(f"FAIL {job_str}", "red"), end="")
+                    cprint(f"KILLING {job_str} (dry run)", "yellow")
             else:
-            print(colored(f"{job.state.name:8}{job_str}", "
+                print(colored(f"{job.state.name:8}{job_str}", "yellow"), end="")
+        elif job.state == JobState.DONE:
+            print(colored(f"DONE {job_str}", "green"), end="")
+        elif job.state == JobState.ERROR:
+            print(colored(f"FAIL {job_str}", "red"), end="")
+        else:
+            print(colored(f"{job.state.name:8}{job_str}", "red"), end="")
 
-
-
-
+        # Show tags if requested (from tags_map)
+        if tags:
+            job_tags = tags_map.get(job.identifier, {})
+            if job_tags:
+                print(f""" {" ".join(f"{k}={v}" for k, v in job_tags.items())}""")
             elif not (kill and perform):
                 print()
+        elif not (kill and perform):
+            print()
 
-
-
-
-
-
-                else:
-                    cprint(" Clean failed", "red")
+        # Clean finished jobs
+        if clean and job.state and job.state.finished():
+            if perform:
+                if provider.clean_job(job, perform=True):
+                    cprint(" Cleaned", "red")
                 else:
-                    cprint("
-
-
+                    cprint(" Clean failed", "red")
+            else:
+                cprint(" Would clean (dry run)", "yellow")
 
-
-    # Close provider if we created it for write mode
-    if not read_only:
-        provider.close()
+    print()
 
 
 @click.option("--experiment", default=None, help="Restrict to this experiment")
@@ -250,7 +251,7 @@ def log(ctx, jobid: str, follow: bool, std: bool):
         / "jobs"
         / task_name
         / task_hash
-        / f"""{name}.{
+        / f"""{name}.{"out" if std else "err"}"""
     )
     if not log_path.exists():
         cprint(f"Log file not found: {log_path}", "red")
@@ -284,17 +285,15 @@ def cleanup_partials(ctx, perform: bool):
     """Clean up orphan partial directories
 
     Partial directories are shared checkpoint locations created by
-
+    partial. When all jobs using a partial are deleted, the
     partial becomes orphaned and can be cleaned up.
 
     This command finds all orphan partials and deletes them (or shows
     what would be deleted in dry-run mode).
     """
-    from experimaestro.scheduler.
+    from experimaestro.scheduler.workspace_state_provider import WorkspaceStateProvider
 
-    provider = WorkspaceStateProvider.get_instance(
-        ctx.obj.workspace.path, read_only=not perform
-    )
+    provider = WorkspaceStateProvider.get_instance(ctx.obj.workspace.path)
 
     try:
         orphan_paths = provider.cleanup_orphan_partials(perform=perform)
@@ -317,3 +316,186 @@ def cleanup_partials(ctx, perform: bool):
     finally:
         if perform:
             provider.close()
+
+
+@click.option(
+    "--kill", is_flag=True, help="Kill running stray jobs (requires --perform)"
+)
+@click.option(
+    "--delete", is_flag=True, help="Delete non-running stray jobs (requires --perform)"
+)
+@click.option("--perform", is_flag=True, help="Actually perform the operation")
+@click.option(
+    "--force",
+    is_flag=True,
+    help="Bypass safety checks (e.g., when scheduler is running)",
+)
+@click.option("--size", is_flag=True, help="Show size of each job folder")
+@click.option("--fullpath", is_flag=True, help="Show full paths")
+@jobs.command()
+@click.pass_context
+def stray(
+    ctx,
+    kill: bool,
+    delete: bool,
+    perform: bool,
+    force: bool,
+    size: bool,
+    fullpath: bool,
+):
+    """Manage stray jobs (jobs not associated with any experiment)
+
+    Stray jobs are jobs that exist on disk but are not referenced by any
+    experiment. This can happen when:
+
+    \b
+    - An experiment plan changes and a job is no longer needed
+    - An experiment is deleted but jobs remain on disk
+    - Jobs are manually created outside of experiments
+
+    Safety: By default, this command will warn if an experiment appears to be
+    running (scheduler active). Use --force to bypass this check.
+
+    Examples:
+
+    \b
+    # List all stray jobs
+    experimaestro jobs stray
+
+    \b
+    # List stray jobs with sizes
+    experimaestro jobs stray --size
+
+    \b
+    # Kill running stray jobs (dry run)
+    experimaestro jobs stray --kill
+
+    \b
+    # Kill running stray jobs (for real)
+    experimaestro jobs stray --kill --perform
+
+    \b
+    # Delete non-running stray jobs
+    experimaestro jobs stray --delete --perform
+
+    \b
+    # Kill and delete all stray jobs (dangerous!)
+    experimaestro jobs stray --kill --delete --perform --force
+    """
+    from experimaestro.scheduler.workspace_state_provider import WorkspaceStateProvider
+    from experimaestro.scheduler import JobState
+
+    provider = WorkspaceStateProvider.get_instance(ctx.obj.workspace.path)
+
+    # Safety check: warn if scheduler appears to be running
+    if provider.is_live and not force:
+        cprint(
+            "Warning: Scheduler appears to be running. Stray detection may be inaccurate.",
+            "yellow",
+        )
+        cprint("Use --force to proceed anyway.", "yellow")
+        if perform:
+            cprint("Aborting due to active scheduler.", "red")
+            return
+
+    # Get stray jobs (running orphans) and all orphan jobs
+    stray_jobs = provider.get_stray_jobs()
+    stray_jobs = [j for j in stray_jobs if j.path and j.path.exists()]
+
+    orphan_jobs = provider.get_orphan_jobs()
+    orphan_jobs = [j for j in orphan_jobs if j.path and j.path.exists()]
+
+    # Finished orphans = orphans that are not stray (not running)
+    stray_ids = {j.identifier for j in stray_jobs}
+    finished_jobs = [j for j in orphan_jobs if j.identifier not in stray_ids]
+
+    if not stray_jobs and not finished_jobs:
+        cprint("No stray or orphan jobs found.", "green")
+        return
+
+    # Print summary
+    print(
+        f"Found {len(stray_jobs)} stray (running) and {len(finished_jobs)} orphan (finished) jobs:"
+    )
+    if stray_jobs:
+        cprint(f" {len(stray_jobs)} stray (running)", "yellow")
+    if finished_jobs:
+        cprint(f" {len(finished_jobs)} orphan (finished)", "cyan")
+    print()
+
+    # Combine for display (stray first, then finished orphans)
+    all_jobs = stray_jobs + finished_jobs
+
+    # Process each job
+    killed_count = 0
+    deleted_count = 0
+
+    for job in all_jobs:
+        job_str = str(job.path) if fullpath else f"{job.task_id}/{job.identifier}"
+        state_name = job.state.name if job.state else "UNKNOWN"
+
+        # Determine color based on state
+        if job.state and job.state.running():
+            state_color = "yellow"
+        elif job.state == JobState.DONE:
+            state_color = "green"
+        elif job.state == JobState.ERROR:
+            state_color = "red"
+        else:
+            state_color = "white"
+
+        # Show job info
+        print(colored(f"{state_name:10}{job_str}", state_color), end="")
+
+        # Show size if requested
+        if size and job.path and job.path.exists():
+            try:
+                result = subprocess.run(
+                    ["du", "-hs", str(job.path)],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    size_str = result.stdout.strip().split()[0]
+                    print(f" [{size_str}]", end="")
+            except (subprocess.TimeoutExpired, Exception):
+                print(" [?]", end="")
+
+        print()
+
+        # Kill running jobs if requested
+        if kill and job.state and job.state.running():
+            if perform:
+                if provider.kill_job(job, perform=True):
+                    cprint(" KILLED", "light_red")
+                    killed_count += 1
+                else:
+                    cprint(" KILL FAILED", "red")
+            else:
+                cprint(" Would kill (dry run)", "yellow")
+
+        # Delete non-running jobs if requested
+        if delete and (not job.state or not job.state.running()):
+            if perform:
+                success, msg = provider.delete_job_safely(job, cascade_orphans=False)
+                if success:
+                    cprint(" DELETED", "light_red")
+                    deleted_count += 1
+                else:
+                    cprint(f" DELETE FAILED: {msg}", "red")
+            else:
+                cprint(" Would delete (dry run)", "yellow")
+
+    # Summary
+    print()
+    if perform:
+        if kill and killed_count > 0:
+            cprint(f"Killed {killed_count} running job(s)", "green")
+        if delete and deleted_count > 0:
+            cprint(f"Deleted {deleted_count} job(s)", "green")
+        # Clean up orphan partials after deleting jobs
+        provider.cleanup_orphan_partials(perform=True)
+    else:
+        if kill or delete:
+            cprint("Dry run - no changes made. Use --perform to execute.", "yellow")
```
experimaestro/cli/refactor.py
CHANGED

```diff
@@ -153,8 +153,7 @@ def refactor_file(file_path: Path, perform: bool) -> int:
             # Multi-line value - more complex handling needed
             # For now, just report it
             cprint(
-                f" {file_path}:{line_num}: {class_name}.{param_name} has multi-line default "
-                f"(manual fix required)",
+                f" {file_path}:{line_num}: {class_name}.{param_name} has multi-line default (manual fix required)",
                 "red",
             )
             changes_made += 1
```
experimaestro/commandline.py
CHANGED

```diff
@@ -233,6 +233,7 @@ class CommandLineJob(Job):
         launcher=None,
         run_mode: RunMode = None,
         max_retries=None,
+        transient=None,
     ):
         super().__init__(
             parameters,
@@ -240,6 +241,7 @@ class CommandLineJob(Job):
             launcher=launcher,
             run_mode=run_mode,
             max_retries=max_retries,
+            transient=transient,
         )
         self.commandline = commandline
 
@@ -305,11 +307,10 @@ class CommandLineJob(Job):
             self._process = processbuilder.start(True)
 
             with self.pidpath.open("w") as fp:
-
-                json.dump(process_spec, fp)
+                json.dump(self._process.tospec(), fp)
 
-            # Write
-            self.
+            # Write status with process info
+            self.status_path.write_text(json.dumps(self.state_dict()))
 
             self.state = JobState.RUNNING
             logger.info("Process started (%s)", self._process)
@@ -328,6 +329,7 @@ class CommandLineTask:
         workspace=None,
         run_mode=None,
         max_retries=None,
+        transient=None,
     ) -> Job:
         return CommandLineJob(
             self.commandline,
@@ -336,4 +338,5 @@ class CommandLineTask:
             workspace=workspace,
             run_mode=run_mode,
             max_retries=max_retries,
+            transient=transient,
         )
```
experimaestro/connectors/__init__.py
CHANGED

```diff
@@ -117,6 +117,14 @@ class Process:
         """Wait until the process finishes and returns the error code"""
         raise NotImplementedError(f"Not implemented: {self.__class__}.wait")
 
+    async def aio_wait(self) -> int:
+        """Asynchronously wait until the process finishes and returns the error code.
+
+        Subclasses should override this with a truly async implementation.
+        Default implementation uses asyncThreadcheck to run wait() in a thread.
+        """
+        return await asyncThreadcheck("aio_wait", self.wait)
+
     async def aio_state(self, timeout: float | None = None) -> ProcessState:
         """Returns the job state
 
@@ -134,7 +142,7 @@ class Process:
         Returns None if the process has already finished – and no information is
         known about the process.
         """
-        code = await
+        code = await self.aio_wait()
         logger.debug("Got return code %s for %s", code, self)
         return code
 
```
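The base class now provides a thread-backed `aio_wait` fallback (via `asyncThreadcheck` around the blocking `wait()`), and the return-code helper awaits `self.aio_wait()` instead of the synchronous path. Connectors with a real asynchronous process handle can override `aio_wait` directly. The sketch below shows the override pattern on a standalone wrapper class rather than on experimaestro's actual `Process` base class; `LocalAsyncProcess` is an invented name for illustration only.

```python
# Hypothetical sketch of overriding aio_wait() with a truly async wait
# (LocalAsyncProcess is illustrative, not an experimaestro class)
import asyncio
import sys


class LocalAsyncProcess:
    """Wraps an asyncio subprocess so waiting does not block a worker thread."""

    def __init__(self, proc: asyncio.subprocess.Process):
        self._proc = proc

    async def aio_wait(self) -> int:
        # Awaits the subprocess directly instead of running a blocking wait()
        # in a thread, which is what the base-class default would do
        return await self._proc.wait()


async def main() -> None:
    # Spawn a trivial child process and await its exit code asynchronously
    proc = await asyncio.create_subprocess_exec(sys.executable, "-c", "print('hello')")
    code = await LocalAsyncProcess(proc).aio_wait()
    print("exit code:", code)


if __name__ == "__main__":
    asyncio.run(main())
```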