experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (152) hide show
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +239 -126
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +217 -50
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +629 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +732 -167
  36. experimaestro/scheduler/interfaces.py +316 -101
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  39. experimaestro/scheduler/remote/client.py +171 -117
  40. experimaestro/scheduler/remote/protocol.py +8 -193
  41. experimaestro/scheduler/remote/server.py +95 -71
  42. experimaestro/scheduler/services.py +53 -28
  43. experimaestro/scheduler/state_provider.py +663 -2430
  44. experimaestro/scheduler/state_status.py +1247 -0
  45. experimaestro/scheduler/transient.py +31 -0
  46. experimaestro/scheduler/workspace.py +1 -1
  47. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  48. experimaestro/scriptbuilder.py +4 -4
  49. experimaestro/settings.py +36 -0
  50. experimaestro/tests/conftest.py +33 -5
  51. experimaestro/tests/connectors/bin/executable.py +1 -1
  52. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  53. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  54. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  55. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  56. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  58. experimaestro/tests/launchers/bin/test.py +1 -0
  59. experimaestro/tests/launchers/test_slurm.py +9 -9
  60. experimaestro/tests/partial_reschedule.py +46 -0
  61. experimaestro/tests/restart.py +3 -3
  62. experimaestro/tests/restart_main.py +1 -0
  63. experimaestro/tests/scripts/notifyandwait.py +1 -0
  64. experimaestro/tests/task_partial.py +38 -0
  65. experimaestro/tests/task_tokens.py +2 -2
  66. experimaestro/tests/tasks/test_dynamic.py +6 -6
  67. experimaestro/tests/test_dependencies.py +3 -3
  68. experimaestro/tests/test_deprecated.py +15 -15
  69. experimaestro/tests/test_dynamic_locking.py +317 -0
  70. experimaestro/tests/test_environment.py +24 -14
  71. experimaestro/tests/test_experiment.py +171 -36
  72. experimaestro/tests/test_identifier.py +25 -25
  73. experimaestro/tests/test_identifier_stability.py +3 -5
  74. experimaestro/tests/test_multitoken.py +2 -4
  75. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  76. experimaestro/tests/test_partial_paths.py +81 -138
  77. experimaestro/tests/test_pre_experiment.py +219 -0
  78. experimaestro/tests/test_progress.py +2 -8
  79. experimaestro/tests/test_remote_state.py +560 -99
  80. experimaestro/tests/test_stray_jobs.py +261 -0
  81. experimaestro/tests/test_tasks.py +1 -2
  82. experimaestro/tests/test_token_locking.py +52 -67
  83. experimaestro/tests/test_tokens.py +5 -6
  84. experimaestro/tests/test_transient.py +225 -0
  85. experimaestro/tests/test_workspace_state_provider.py +768 -0
  86. experimaestro/tests/token_reschedule.py +1 -3
  87. experimaestro/tests/utils.py +2 -7
  88. experimaestro/tokens.py +227 -372
  89. experimaestro/tools/diff.py +1 -0
  90. experimaestro/tools/documentation.py +4 -5
  91. experimaestro/tools/jobs.py +1 -2
  92. experimaestro/tui/app.py +438 -1966
  93. experimaestro/tui/app.tcss +162 -0
  94. experimaestro/tui/dialogs.py +172 -0
  95. experimaestro/tui/log_viewer.py +253 -3
  96. experimaestro/tui/messages.py +137 -0
  97. experimaestro/tui/utils.py +54 -0
  98. experimaestro/tui/widgets/__init__.py +23 -0
  99. experimaestro/tui/widgets/experiments.py +468 -0
  100. experimaestro/tui/widgets/global_services.py +238 -0
  101. experimaestro/tui/widgets/jobs.py +972 -0
  102. experimaestro/tui/widgets/log.py +156 -0
  103. experimaestro/tui/widgets/orphans.py +363 -0
  104. experimaestro/tui/widgets/runs.py +185 -0
  105. experimaestro/tui/widgets/services.py +314 -0
  106. experimaestro/tui/widgets/stray_jobs.py +528 -0
  107. experimaestro/utils/__init__.py +1 -1
  108. experimaestro/utils/environment.py +105 -22
  109. experimaestro/utils/fswatcher.py +124 -0
  110. experimaestro/utils/jobs.py +1 -2
  111. experimaestro/utils/jupyter.py +1 -2
  112. experimaestro/utils/logging.py +72 -0
  113. experimaestro/version.py +2 -2
  114. experimaestro/webui/__init__.py +9 -0
  115. experimaestro/webui/app.py +117 -0
  116. experimaestro/{server → webui}/data/index.css +66 -11
  117. experimaestro/webui/data/index.css.map +1 -0
  118. experimaestro/{server → webui}/data/index.js +82763 -87217
  119. experimaestro/webui/data/index.js.map +1 -0
  120. experimaestro/webui/routes/__init__.py +5 -0
  121. experimaestro/webui/routes/auth.py +53 -0
  122. experimaestro/webui/routes/proxy.py +117 -0
  123. experimaestro/webui/server.py +200 -0
  124. experimaestro/webui/state_bridge.py +152 -0
  125. experimaestro/webui/websocket.py +413 -0
  126. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
  127. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  128. experimaestro/cli/progress.py +0 -269
  129. experimaestro/scheduler/state.py +0 -75
  130. experimaestro/scheduler/state_db.py +0 -437
  131. experimaestro/scheduler/state_sync.py +0 -891
  132. experimaestro/server/__init__.py +0 -467
  133. experimaestro/server/data/index.css.map +0 -1
  134. experimaestro/server/data/index.js.map +0 -1
  135. experimaestro/tests/test_cli_jobs.py +0 -615
  136. experimaestro/tests/test_file_progress.py +0 -425
  137. experimaestro/tests/test_file_progress_integration.py +0 -477
  138. experimaestro/tests/test_state_db.py +0 -434
  139. experimaestro-2.0.0b8.dist-info/RECORD +0 -187
  140. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  141. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  142. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  143. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  145. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  147. /experimaestro/{server → webui}/data/index.html +0 -0
  148. /experimaestro/{server → webui}/data/login.html +0 -0
  149. /experimaestro/{server → webui}/data/manifest.json +0 -0
  150. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  151. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  152. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -5,12 +5,27 @@ tags, and other attributes.
5
5
  """
6
6
 
7
7
  import re
8
- from typing import Callable, TYPE_CHECKING
8
+ from dataclasses import dataclass, field
9
+ from typing import Callable, Dict, TYPE_CHECKING
9
10
  import pyparsing as pp
10
11
 
11
12
  if TYPE_CHECKING:
12
13
  from experimaestro.scheduler.state_provider import MockJob
13
14
 
15
+ # Type alias for tags map: job_id -> {tag_key: tag_value}
16
+ TagsMap = Dict[str, Dict[str, str]]
17
+
18
+
19
+ @dataclass
20
+ class FilterContext:
21
+ """Context for filter evaluation containing experiment-scoped data
22
+
23
+ Attributes:
24
+ tags_map: Maps job identifiers to their tags dict for the current experiment/run
25
+ """
26
+
27
+ tags_map: TagsMap = field(default_factory=dict)
28
+
14
29
 
15
30
  # --- classes for processing
16
31
 
@@ -19,14 +34,16 @@ class VarExpr:
19
34
  def __init__(self, values):
20
35
  (self.varname,) = values
21
36
 
22
- def get(self, job: "MockJob"):
37
+ def get(self, job: "MockJob", ctx: FilterContext):
23
38
  if self.varname == "@state":
24
39
  return job.state.name if job.state else None
25
40
 
26
41
  if self.varname == "@name":
27
42
  return str(job.path.parent.name)
28
43
 
29
- return job.tags.get(self.varname, None)
44
+ # Tags are stored in JobTagModel, accessed via ctx.tags_map keyed by job identifier
45
+ job_tags = ctx.tags_map.get(job.identifier, {})
46
+ return job_tags.get(self.varname, None)
30
47
 
31
48
  def __repr__(self):
32
49
  return f"""VAR<{self.varname}>"""
@@ -39,8 +56,8 @@ class BaseInExpr:
39
56
 
40
57
 
41
58
  class InExpr(BaseInExpr):
42
- def filter(self, job: "MockJob"):
43
- value = self.var.get(job)
59
+ def filter(self, job: "MockJob", ctx: FilterContext):
60
+ value = self.var.get(job, ctx)
44
61
  return value in self.values
45
62
 
46
63
  def __repr__(self):
@@ -48,8 +65,8 @@ class InExpr(BaseInExpr):
48
65
 
49
66
 
50
67
  class NotInExpr(BaseInExpr):
51
- def filter(self, job: "MockJob"):
52
- value = self.var.get(job)
68
+ def filter(self, job: "MockJob", ctx: FilterContext):
69
+ value = self.var.get(job, ctx)
53
70
  return value not in self.values
54
71
 
55
72
  def __repr__(self):
@@ -70,8 +87,8 @@ class RegexExpr:
70
87
 
71
88
  raise AssertionError()
72
89
 
73
- def filter(self, job: "MockJob"):
74
- value = self.var.get(job)
90
+ def filter(self, job: "MockJob", ctx: FilterContext):
91
+ value = self.var.get(job, ctx)
75
92
  if not value:
76
93
  return False
77
94
 
@@ -82,7 +99,7 @@ class ConstantString:
82
99
  def __init__(self, tokens):
83
100
  (self.value,) = tokens
84
101
 
85
- def get(self, _job: "MockJob"):
102
+ def get(self, _job: "MockJob", _ctx: FilterContext):
86
103
  return self.value
87
104
 
88
105
  def __repr__(self):
@@ -96,8 +113,8 @@ class EqExpr:
96
113
  def __repr__(self):
97
114
  return f"""EQ[{self.var1}, {self.var2}]"""
98
115
 
99
- def filter(self, job: "MockJob"):
100
- return self.var1.get(job) == self.var2.get(job)
116
+ def filter(self, job: "MockJob", ctx: FilterContext):
117
+ return self.var1.get(job, ctx) == self.var2.get(job, ctx)
101
118
 
102
119
 
103
120
  class LogicExpr:
@@ -107,11 +124,11 @@ class LogicExpr:
107
124
  self.operator, self.y = tokens
108
125
  self.x = None
109
126
 
110
- def filter(self, job: "MockJob"):
127
+ def filter(self, job: "MockJob", ctx: FilterContext):
111
128
  if self.operator == "and":
112
- return self.y.filter(job) and self.x.filter(job)
129
+ return self.y.filter(job, ctx) and self.x.filter(job, ctx)
113
130
 
114
- return self.y.filter(job) or self.x.filter(job)
131
+ return self.y.filter(job, ctx) or self.x.filter(job, ctx)
115
132
 
116
133
  @staticmethod
117
134
  def summary(tokens):
@@ -138,7 +155,7 @@ class LogicExpr:
138
155
 
139
156
  # --- Grammar
140
157
 
141
- l = pp.Literal
158
+ lit = pp.Literal
142
159
 
143
160
  lpar, rpar, lbra, rbra, eq, comma, pipe, tilde = map(pp.Suppress, "()[]=,|~")
144
161
  quotedString = pp.QuotedString('"', unquoteResults=True) | pp.QuotedString(
@@ -148,7 +165,7 @@ quotedString = pp.QuotedString('"', unquoteResults=True) | pp.QuotedString(
148
165
  # Tag names can contain letters, digits, underscores, and hyphens
149
166
  # First character must be a letter, rest can include digits, underscores, hyphens
150
167
  tag_name = pp.Word(pp.alphas, pp.alphanums + "_-")
151
- var = l("@state") | l("@name") | tag_name
168
+ var = lit("@state") | lit("@name") | tag_name
152
169
  var.setParseAction(VarExpr)
153
170
 
154
171
  regexExpr = var + tilde + quotedString
@@ -161,15 +178,15 @@ eqExpr.setParseAction(EqExpr)
161
178
 
162
179
  stringList = quotedString + pp.ZeroOrMore(comma + quotedString)
163
180
 
164
- notInExpr = var + (pp.Suppress(l("not in")) + lbra + stringList + rbra)
181
+ notInExpr = var + (pp.Suppress(lit("not in")) + lbra + stringList + rbra)
165
182
  notInExpr.setParseAction(NotInExpr)
166
183
 
167
- inExpr = var + (pp.Suppress(l("in")) + lbra + stringList + rbra)
184
+ inExpr = var + (pp.Suppress(lit("in")) + lbra + stringList + rbra)
168
185
  inExpr.setParseAction(InExpr)
169
186
 
170
187
  matchExpr = eqExpr | regexExpr | inExpr | notInExpr
171
188
 
172
- booleanOp = l("and") | l("or")
189
+ booleanOp = lit("and") | lit("or")
173
190
  logicExpr = (
174
191
  matchExpr + pp.ZeroOrMore((booleanOp + matchExpr).setParseAction(LogicExpr))
175
192
  ).setParseAction(LogicExpr.summary)
@@ -181,14 +198,22 @@ filterExpr = (
181
198
  expr = (matchExpr + pp.Optional(pipe + filterExpr)).setParseAction(LogicExpr.generator)
182
199
 
183
200
 
184
- def createFilter(query: str) -> Callable[["MockJob"], bool]:
201
+ def createFilter(query: str, ctx: FilterContext = None) -> Callable[["MockJob"], bool]:
185
202
  """Returns a filter function given a query string
186
203
 
187
204
  Args:
188
205
  query: Filter expression (e.g., '@state = "DONE" and model = "bm25"')
206
+ ctx: FilterContext containing tags map and other experiment-scoped data.
207
+ If None, an empty context is used.
189
208
 
190
209
  Returns:
191
- A callable that takes a MockJob and returns True if it matches
210
+ A callable that takes a MockJob and returns True if it matches.
192
211
  """
212
+ if ctx is None:
213
+ ctx = FilterContext()
193
214
  (r,) = logicExpr.parseString(query, parseAll=True)
194
- return r.filter
215
+
216
+ def filter_fn(job: "MockJob") -> bool:
217
+ return r.filter(job, ctx)
218
+
219
+ return filter_fn
experimaestro/cli/jobs.py CHANGED
@@ -65,90 +65,91 @@ def process(
65
65
  fullpath: Show full paths instead of short names
66
66
  count: Limit output to N most recent jobs (0 = no limit)
67
67
  """
68
- from .filter import createFilter
69
- from experimaestro.scheduler.state_provider import WorkspaceStateProvider
68
+ from .filter import createFilter, FilterContext
69
+ from experimaestro.scheduler.workspace_state_provider import WorkspaceStateProvider
70
70
  from experimaestro.scheduler import JobState
71
71
 
72
- _filter = createFilter(filter) if filter else None
72
+ # Get state provider (read-only monitoring)
73
+ provider = WorkspaceStateProvider.get_instance(workspace.path)
73
74
 
74
- # Get state provider (write mode for kill/clean operations)
75
- read_only = not (kill or clean)
76
- provider = WorkspaceStateProvider.get_instance(workspace.path, read_only=read_only)
77
-
78
- try:
79
- # Get all jobs from the database
75
+ # Get jobs from the database, optionally filtered by experiment
76
+ if experiment:
77
+ all_jobs = provider.get_jobs(experiment_id=experiment)
78
+ else:
80
79
  all_jobs = provider.get_all_jobs()
81
80
 
82
- # Filter by experiment if specified
83
- if experiment:
84
- all_jobs = [j for j in all_jobs if j.experiment_id == experiment]
81
+ # Load tags map for the experiment (if specified)
82
+ tags_map = {}
83
+ if experiment:
84
+ tags_map = provider.get_tags_map(experiment_id=experiment)
85
85
 
86
- # Apply filter expression
87
- if _filter:
88
- all_jobs = [j for j in all_jobs if _filter(j)]
86
+ # Create filter context with tags map
87
+ filter_ctx = FilterContext(tags_map=tags_map)
89
88
 
90
- # Sort by submission time (most recent first)
91
- # Jobs without submittime go to the end
92
- all_jobs.sort(key=lambda j: j.submittime or 0, reverse=True)
89
+ # Create filter function with context
90
+ _filter = createFilter(filter, filter_ctx) if filter else None
93
91
 
94
- # Limit to N most recent jobs if count is specified
95
- if count > 0:
96
- all_jobs = all_jobs[:count]
92
+ # Apply filter expression
93
+ if _filter:
94
+ all_jobs = [j for j in all_jobs if _filter(j)]
97
95
 
98
- if not all_jobs:
99
- cprint("No jobs found.", "yellow")
100
- return
96
+ # Sort by submission time (most recent first)
97
+ # Jobs without submittime go to the end
98
+ all_jobs.sort(key=lambda j: j.submittime or 0, reverse=True)
99
+
100
+ # Limit to N most recent jobs if count is specified
101
+ if count > 0:
102
+ all_jobs = all_jobs[:count]
101
103
 
102
- # Process each job
103
- for job in all_jobs:
104
- job_str = str(job.path) if fullpath else f"{job.task_id}/{job.identifier}"
105
-
106
- # Add experiment info
107
- if job.experiment_id:
108
- job_str += f" [{job.experiment_id}]"
109
-
110
- if job.state is None or job.state == JobState.UNSCHEDULED:
111
- print(colored(f"UNSCHED {job_str}", "red"), end="")
112
- elif job.state.running():
113
- if kill:
114
- if perform:
115
- if provider.kill_job(job, perform=True):
116
- cprint(f"KILLED {job_str}", "light_red")
117
- else:
118
- cprint(f"KILL FAILED {job_str}", "red")
104
+ if not all_jobs:
105
+ cprint("No jobs found.", "yellow")
106
+ return
107
+
108
+ # Process each job
109
+ for job in all_jobs:
110
+ job_str = str(job.path) if fullpath else f"{job.task_id}/{job.identifier}"
111
+
112
+ if job.state is None or job.state == JobState.UNSCHEDULED:
113
+ print(colored(f"UNSCHED {job_str}", "red"), end="")
114
+ elif job.state.running():
115
+ if kill:
116
+ if perform:
117
+ if provider.kill_job(job, perform=True):
118
+ cprint(f"KILLED {job_str}", "light_red")
119
119
  else:
120
- cprint(f"KILLING {job_str} (dry run)", "yellow")
120
+ cprint(f"KILL FAILED {job_str}", "red")
121
121
  else:
122
- print(colored(f"{job.state.name:8}{job_str}", "yellow"), end="")
123
- elif job.state == JobState.DONE:
124
- print(colored(f"DONE {job_str}", "green"), end="")
125
- elif job.state == JobState.ERROR:
126
- print(colored(f"FAIL {job_str}", "red"), end="")
122
+ cprint(f"KILLING {job_str} (dry run)", "yellow")
127
123
  else:
128
- print(colored(f"{job.state.name:8}{job_str}", "red"), end="")
124
+ print(colored(f"{job.state.name:8}{job_str}", "yellow"), end="")
125
+ elif job.state == JobState.DONE:
126
+ print(colored(f"DONE {job_str}", "green"), end="")
127
+ elif job.state == JobState.ERROR:
128
+ print(colored(f"FAIL {job_str}", "red"), end="")
129
+ else:
130
+ print(colored(f"{job.state.name:8}{job_str}", "red"), end="")
129
131
 
130
- # Show tags if requested
131
- if tags and job.tags:
132
- print(f""" {" ".join(f"{k}={v}" for k, v in job.tags.items())}""")
132
+ # Show tags if requested (from tags_map)
133
+ if tags:
134
+ job_tags = tags_map.get(job.identifier, {})
135
+ if job_tags:
136
+ print(f""" {" ".join(f"{k}={v}" for k, v in job_tags.items())}""")
133
137
  elif not (kill and perform):
134
138
  print()
139
+ elif not (kill and perform):
140
+ print()
135
141
 
136
- # Clean finished jobs
137
- if clean and job.state and job.state.finished():
138
- if perform:
139
- if provider.clean_job(job, perform=True):
140
- cprint(" Cleaned", "red")
141
- else:
142
- cprint(" Clean failed", "red")
142
+ # Clean finished jobs
143
+ if clean and job.state and job.state.finished():
144
+ if perform:
145
+ if provider.clean_job(job, perform=True):
146
+ cprint(" Cleaned", "red")
143
147
  else:
144
- cprint(" Would clean (dry run)", "yellow")
145
-
146
- print()
148
+ cprint(" Clean failed", "red")
149
+ else:
150
+ cprint(" Would clean (dry run)", "yellow")
147
151
 
148
- finally:
149
- # Close provider if we created it for write mode
150
- if not read_only:
151
- provider.close()
152
+ print()
152
153
 
153
154
 
154
155
  @click.option("--experiment", default=None, help="Restrict to this experiment")
@@ -250,7 +251,7 @@ def log(ctx, jobid: str, follow: bool, std: bool):
250
251
  / "jobs"
251
252
  / task_name
252
253
  / task_hash
253
- / f"""{name}.{'out' if std else 'err'}"""
254
+ / f"""{name}.{"out" if std else "err"}"""
254
255
  )
255
256
  if not log_path.exists():
256
257
  cprint(f"Log file not found: {log_path}", "red")
@@ -284,17 +285,15 @@ def cleanup_partials(ctx, perform: bool):
284
285
  """Clean up orphan partial directories
285
286
 
286
287
  Partial directories are shared checkpoint locations created by
287
- subparameters. When all jobs using a partial are deleted, the
288
+ partial. When all jobs using a partial are deleted, the
288
289
  partial becomes orphaned and can be cleaned up.
289
290
 
290
291
  This command finds all orphan partials and deletes them (or shows
291
292
  what would be deleted in dry-run mode).
292
293
  """
293
- from experimaestro.scheduler.state_provider import WorkspaceStateProvider
294
+ from experimaestro.scheduler.workspace_state_provider import WorkspaceStateProvider
294
295
 
295
- provider = WorkspaceStateProvider.get_instance(
296
- ctx.obj.workspace.path, read_only=not perform
297
- )
296
+ provider = WorkspaceStateProvider.get_instance(ctx.obj.workspace.path)
298
297
 
299
298
  try:
300
299
  orphan_paths = provider.cleanup_orphan_partials(perform=perform)
@@ -317,3 +316,186 @@ def cleanup_partials(ctx, perform: bool):
317
316
  finally:
318
317
  if perform:
319
318
  provider.close()
319
+
320
+
321
+ @click.option(
322
+ "--kill", is_flag=True, help="Kill running stray jobs (requires --perform)"
323
+ )
324
+ @click.option(
325
+ "--delete", is_flag=True, help="Delete non-running stray jobs (requires --perform)"
326
+ )
327
+ @click.option("--perform", is_flag=True, help="Actually perform the operation")
328
+ @click.option(
329
+ "--force",
330
+ is_flag=True,
331
+ help="Bypass safety checks (e.g., when scheduler is running)",
332
+ )
333
+ @click.option("--size", is_flag=True, help="Show size of each job folder")
334
+ @click.option("--fullpath", is_flag=True, help="Show full paths")
335
+ @jobs.command()
336
+ @click.pass_context
337
+ def stray(
338
+ ctx,
339
+ kill: bool,
340
+ delete: bool,
341
+ perform: bool,
342
+ force: bool,
343
+ size: bool,
344
+ fullpath: bool,
345
+ ):
346
+ """Manage stray jobs (jobs not associated with any experiment)
347
+
348
+ Stray jobs are jobs that exist on disk but are not referenced by any
349
+ experiment. This can happen when:
350
+
351
+ \b
352
+ - An experiment plan changes and a job is no longer needed
353
+ - An experiment is deleted but jobs remain on disk
354
+ - Jobs are manually created outside of experiments
355
+
356
+ Safety: By default, this command will warn if an experiment appears to be
357
+ running (scheduler active). Use --force to bypass this check.
358
+
359
+ Examples:
360
+
361
+ \b
362
+ # List all stray jobs
363
+ experimaestro jobs stray
364
+
365
+ \b
366
+ # List stray jobs with sizes
367
+ experimaestro jobs stray --size
368
+
369
+ \b
370
+ # Kill running stray jobs (dry run)
371
+ experimaestro jobs stray --kill
372
+
373
+ \b
374
+ # Kill running stray jobs (for real)
375
+ experimaestro jobs stray --kill --perform
376
+
377
+ \b
378
+ # Delete non-running stray jobs
379
+ experimaestro jobs stray --delete --perform
380
+
381
+ \b
382
+ # Kill and delete all stray jobs (dangerous!)
383
+ experimaestro jobs stray --kill --delete --perform --force
384
+ """
385
+ from experimaestro.scheduler.workspace_state_provider import WorkspaceStateProvider
386
+ from experimaestro.scheduler import JobState
387
+
388
+ provider = WorkspaceStateProvider.get_instance(ctx.obj.workspace.path)
389
+
390
+ # Safety check: warn if scheduler appears to be running
391
+ if provider.is_live and not force:
392
+ cprint(
393
+ "Warning: Scheduler appears to be running. Stray detection may be inaccurate.",
394
+ "yellow",
395
+ )
396
+ cprint("Use --force to proceed anyway.", "yellow")
397
+ if perform:
398
+ cprint("Aborting due to active scheduler.", "red")
399
+ return
400
+
401
+ # Get stray jobs (running orphans) and all orphan jobs
402
+ stray_jobs = provider.get_stray_jobs()
403
+ stray_jobs = [j for j in stray_jobs if j.path and j.path.exists()]
404
+
405
+ orphan_jobs = provider.get_orphan_jobs()
406
+ orphan_jobs = [j for j in orphan_jobs if j.path and j.path.exists()]
407
+
408
+ # Finished orphans = orphans that are not stray (not running)
409
+ stray_ids = {j.identifier for j in stray_jobs}
410
+ finished_jobs = [j for j in orphan_jobs if j.identifier not in stray_ids]
411
+
412
+ if not stray_jobs and not finished_jobs:
413
+ cprint("No stray or orphan jobs found.", "green")
414
+ return
415
+
416
+ # Print summary
417
+ print(
418
+ f"Found {len(stray_jobs)} stray (running) and {len(finished_jobs)} orphan (finished) jobs:"
419
+ )
420
+ if stray_jobs:
421
+ cprint(f" {len(stray_jobs)} stray (running)", "yellow")
422
+ if finished_jobs:
423
+ cprint(f" {len(finished_jobs)} orphan (finished)", "cyan")
424
+ print()
425
+
426
+ # Combine for display (stray first, then finished orphans)
427
+ all_jobs = stray_jobs + finished_jobs
428
+
429
+ # Process each job
430
+ killed_count = 0
431
+ deleted_count = 0
432
+
433
+ for job in all_jobs:
434
+ job_str = str(job.path) if fullpath else f"{job.task_id}/{job.identifier}"
435
+ state_name = job.state.name if job.state else "UNKNOWN"
436
+
437
+ # Determine color based on state
438
+ if job.state and job.state.running():
439
+ state_color = "yellow"
440
+ elif job.state == JobState.DONE:
441
+ state_color = "green"
442
+ elif job.state == JobState.ERROR:
443
+ state_color = "red"
444
+ else:
445
+ state_color = "white"
446
+
447
+ # Show job info
448
+ print(colored(f"{state_name:10}{job_str}", state_color), end="")
449
+
450
+ # Show size if requested
451
+ if size and job.path and job.path.exists():
452
+ try:
453
+ result = subprocess.run(
454
+ ["du", "-hs", str(job.path)],
455
+ capture_output=True,
456
+ text=True,
457
+ timeout=10,
458
+ )
459
+ if result.returncode == 0:
460
+ size_str = result.stdout.strip().split()[0]
461
+ print(f" [{size_str}]", end="")
462
+ except (subprocess.TimeoutExpired, Exception):
463
+ print(" [?]", end="")
464
+
465
+ print()
466
+
467
+ # Kill running jobs if requested
468
+ if kill and job.state and job.state.running():
469
+ if perform:
470
+ if provider.kill_job(job, perform=True):
471
+ cprint(" KILLED", "light_red")
472
+ killed_count += 1
473
+ else:
474
+ cprint(" KILL FAILED", "red")
475
+ else:
476
+ cprint(" Would kill (dry run)", "yellow")
477
+
478
+ # Delete non-running jobs if requested
479
+ if delete and (not job.state or not job.state.running()):
480
+ if perform:
481
+ success, msg = provider.delete_job_safely(job, cascade_orphans=False)
482
+ if success:
483
+ cprint(" DELETED", "light_red")
484
+ deleted_count += 1
485
+ else:
486
+ cprint(f" DELETE FAILED: {msg}", "red")
487
+ else:
488
+ cprint(" Would delete (dry run)", "yellow")
489
+
490
+ # Summary
491
+ print()
492
+ if perform:
493
+ if kill and killed_count > 0:
494
+ cprint(f"Killed {killed_count} running job(s)", "green")
495
+ if delete and deleted_count > 0:
496
+ cprint(f"Deleted {deleted_count} job(s)", "green")
497
+ # Clean up orphan partials after deleting jobs
498
+ provider.cleanup_orphan_partials(perform=True)
499
+ else:
500
+ if kill or delete:
501
+ cprint("Dry run - no changes made. Use --perform to execute.", "yellow")
@@ -153,8 +153,7 @@ def refactor_file(file_path: Path, perform: bool) -> int:
153
153
  # Multi-line value - more complex handling needed
154
154
  # For now, just report it
155
155
  cprint(
156
- f" {file_path}:{line_num}: {class_name}.{param_name} has multi-line default "
157
- f"(manual fix required)",
156
+ f" {file_path}:{line_num}: {class_name}.{param_name} has multi-line default (manual fix required)",
158
157
  "red",
159
158
  )
160
159
  changes_made += 1
@@ -233,6 +233,7 @@ class CommandLineJob(Job):
233
233
  launcher=None,
234
234
  run_mode: RunMode = None,
235
235
  max_retries=None,
236
+ transient=None,
236
237
  ):
237
238
  super().__init__(
238
239
  parameters,
@@ -240,6 +241,7 @@ class CommandLineJob(Job):
240
241
  launcher=launcher,
241
242
  run_mode=run_mode,
242
243
  max_retries=max_retries,
244
+ transient=transient,
243
245
  )
244
246
  self.commandline = commandline
245
247
 
@@ -305,11 +307,10 @@ class CommandLineJob(Job):
305
307
  self._process = processbuilder.start(True)
306
308
 
307
309
  with self.pidpath.open("w") as fp:
308
- process_spec = self._process.tospec()
309
- json.dump(process_spec, fp)
310
+ json.dump(self._process.tospec(), fp)
310
311
 
311
- # Write process spec to metadata (contains launcher type, job ID, etc.)
312
- self.write_metadata(process=process_spec)
312
+ # Write status with process info
313
+ self.status_path.write_text(json.dumps(self.state_dict()))
313
314
 
314
315
  self.state = JobState.RUNNING
315
316
  logger.info("Process started (%s)", self._process)
@@ -328,6 +329,7 @@ class CommandLineTask:
328
329
  workspace=None,
329
330
  run_mode=None,
330
331
  max_retries=None,
332
+ transient=None,
331
333
  ) -> Job:
332
334
  return CommandLineJob(
333
335
  self.commandline,
@@ -336,4 +338,5 @@ class CommandLineTask:
336
338
  workspace=workspace,
337
339
  run_mode=run_mode,
338
340
  max_retries=max_retries,
341
+ transient=transient,
339
342
  )
@@ -117,6 +117,14 @@ class Process:
117
117
  """Wait until the process finishes and returns the error code"""
118
118
  raise NotImplementedError(f"Not implemented: {self.__class__}.wait")
119
119
 
120
+ async def aio_wait(self) -> int:
121
+ """Asynchronously wait until the process finishes and returns the error code.
122
+
123
+ Subclasses should override this with a truly async implementation.
124
+ Default implementation uses asyncThreadcheck to run wait() in a thread.
125
+ """
126
+ return await asyncThreadcheck("aio_wait", self.wait)
127
+
120
128
  async def aio_state(self, timeout: float | None = None) -> ProcessState:
121
129
  """Returns the job state
122
130
 
@@ -134,7 +142,7 @@ class Process:
134
142
  Returns None if the process has already finished – and no information is
135
143
  known about the process.
136
144
  """
137
- code = await asyncThreadcheck("aio_code", self.wait)
145
+ code = await self.aio_wait()
138
146
  logger.debug("Got return code %s for %s", code, self)
139
147
  return code
140
148