experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (133)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +140 -16
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/progress.py +269 -0
  7. experimaestro/cli/refactor.py +249 -0
  8. experimaestro/click.py +0 -1
  9. experimaestro/commandline.py +19 -3
  10. experimaestro/connectors/__init__.py +22 -3
  11. experimaestro/connectors/local.py +12 -0
  12. experimaestro/core/arguments.py +192 -37
  13. experimaestro/core/identifier.py +127 -12
  14. experimaestro/core/objects/__init__.py +6 -0
  15. experimaestro/core/objects/config.py +702 -285
  16. experimaestro/core/objects/config_walk.py +24 -6
  17. experimaestro/core/serialization.py +91 -34
  18. experimaestro/core/serializers.py +1 -8
  19. experimaestro/core/subparameters.py +164 -0
  20. experimaestro/core/types.py +198 -83
  21. experimaestro/exceptions.py +26 -0
  22. experimaestro/experiments/cli.py +107 -25
  23. experimaestro/generators.py +50 -9
  24. experimaestro/huggingface.py +3 -1
  25. experimaestro/launcherfinder/parser.py +29 -0
  26. experimaestro/launcherfinder/registry.py +3 -3
  27. experimaestro/launchers/__init__.py +26 -1
  28. experimaestro/launchers/direct.py +12 -0
  29. experimaestro/launchers/slurm/base.py +154 -2
  30. experimaestro/mkdocs/base.py +6 -8
  31. experimaestro/mkdocs/metaloader.py +0 -1
  32. experimaestro/mypy.py +452 -7
  33. experimaestro/notifications.py +75 -16
  34. experimaestro/progress.py +404 -0
  35. experimaestro/rpyc.py +0 -1
  36. experimaestro/run.py +19 -6
  37. experimaestro/scheduler/__init__.py +18 -1
  38. experimaestro/scheduler/base.py +504 -959
  39. experimaestro/scheduler/dependencies.py +43 -28
  40. experimaestro/scheduler/dynamic_outputs.py +259 -130
  41. experimaestro/scheduler/experiment.py +582 -0
  42. experimaestro/scheduler/interfaces.py +474 -0
  43. experimaestro/scheduler/jobs.py +485 -0
  44. experimaestro/scheduler/services.py +186 -12
  45. experimaestro/scheduler/signal_handler.py +32 -0
  46. experimaestro/scheduler/state.py +1 -1
  47. experimaestro/scheduler/state_db.py +388 -0
  48. experimaestro/scheduler/state_provider.py +2345 -0
  49. experimaestro/scheduler/state_sync.py +834 -0
  50. experimaestro/scheduler/workspace.py +52 -10
  51. experimaestro/scriptbuilder.py +7 -0
  52. experimaestro/server/__init__.py +153 -32
  53. experimaestro/server/data/index.css +0 -125
  54. experimaestro/server/data/index.css.map +1 -1
  55. experimaestro/server/data/index.js +194 -58
  56. experimaestro/server/data/index.js.map +1 -1
  57. experimaestro/settings.py +47 -6
  58. experimaestro/sphinx/__init__.py +3 -3
  59. experimaestro/taskglobals.py +20 -0
  60. experimaestro/tests/conftest.py +80 -0
  61. experimaestro/tests/core/test_generics.py +2 -2
  62. experimaestro/tests/identifier_stability.json +45 -0
  63. experimaestro/tests/launchers/bin/sacct +6 -2
  64. experimaestro/tests/launchers/bin/sbatch +4 -2
  65. experimaestro/tests/launchers/common.py +2 -2
  66. experimaestro/tests/launchers/test_slurm.py +80 -0
  67. experimaestro/tests/restart.py +1 -1
  68. experimaestro/tests/tasks/all.py +7 -0
  69. experimaestro/tests/tasks/test_dynamic.py +231 -0
  70. experimaestro/tests/test_checkers.py +2 -2
  71. experimaestro/tests/test_cli_jobs.py +615 -0
  72. experimaestro/tests/test_dependencies.py +11 -17
  73. experimaestro/tests/test_deprecated.py +630 -0
  74. experimaestro/tests/test_environment.py +200 -0
  75. experimaestro/tests/test_experiment.py +3 -3
  76. experimaestro/tests/test_file_progress.py +425 -0
  77. experimaestro/tests/test_file_progress_integration.py +477 -0
  78. experimaestro/tests/test_forward.py +3 -3
  79. experimaestro/tests/test_generators.py +93 -0
  80. experimaestro/tests/test_identifier.py +520 -169
  81. experimaestro/tests/test_identifier_stability.py +458 -0
  82. experimaestro/tests/test_instance.py +16 -21
  83. experimaestro/tests/test_multitoken.py +442 -0
  84. experimaestro/tests/test_mypy.py +433 -0
  85. experimaestro/tests/test_objects.py +314 -30
  86. experimaestro/tests/test_outputs.py +8 -8
  87. experimaestro/tests/test_param.py +22 -26
  88. experimaestro/tests/test_partial_paths.py +231 -0
  89. experimaestro/tests/test_progress.py +2 -50
  90. experimaestro/tests/test_resumable_task.py +480 -0
  91. experimaestro/tests/test_serializers.py +141 -60
  92. experimaestro/tests/test_state_db.py +434 -0
  93. experimaestro/tests/test_subparameters.py +160 -0
  94. experimaestro/tests/test_tags.py +151 -15
  95. experimaestro/tests/test_tasks.py +137 -160
  96. experimaestro/tests/test_token_locking.py +252 -0
  97. experimaestro/tests/test_tokens.py +25 -19
  98. experimaestro/tests/test_types.py +133 -11
  99. experimaestro/tests/test_validation.py +19 -19
  100. experimaestro/tests/test_workspace_triggers.py +158 -0
  101. experimaestro/tests/token_reschedule.py +5 -3
  102. experimaestro/tests/utils.py +2 -2
  103. experimaestro/tokens.py +154 -57
  104. experimaestro/tools/diff.py +8 -1
  105. experimaestro/tui/__init__.py +8 -0
  106. experimaestro/tui/app.py +2303 -0
  107. experimaestro/tui/app.tcss +353 -0
  108. experimaestro/tui/log_viewer.py +228 -0
  109. experimaestro/typingutils.py +11 -2
  110. experimaestro/utils/__init__.py +23 -0
  111. experimaestro/utils/environment.py +148 -0
  112. experimaestro/utils/git.py +129 -0
  113. experimaestro/utils/resources.py +1 -1
  114. experimaestro/version.py +34 -0
  115. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
  116. experimaestro-2.0.0b4.dist-info/RECORD +181 -0
  117. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
  118. experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
  119. experimaestro/compat.py +0 -6
  120. experimaestro/core/objects.pyi +0 -225
  121. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  122. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  123. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  124. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  125. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  126. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  127. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  128. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  129. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  130. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  131. experimaestro-1.11.1.dist-info/RECORD +0 -158
  132. experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
  133. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
@@ -1,15 +1,11 @@
1
1
  """Dependency between tasks and tokens"""
2
2
 
3
3
  import threading
4
- from typing import Optional, Set, TYPE_CHECKING
5
- import asyncio
4
+ from typing import Set
5
+ from abc import ABC, abstractmethod
6
6
  from enum import Enum
7
- from ..utils import logger
8
7
  from ..locking import Lock
9
8
 
10
- if TYPE_CHECKING:
11
- from . import Job
12
-
13
9
 
14
10
  class Dependents:
15
11
  """Encapsulate the access to the dependents"""
@@ -47,32 +43,51 @@ class DependencyStatus(Enum):
47
43
  """Dependency won't be available in the foreseeable future"""
48
44
 
49
45
 
50
- class Dependency:
51
- # Dependency status
52
- loop: asyncio.AbstractEventLoop
46
+ class Dependency(ABC):
47
+ """Base class for dependencies
48
+
49
+ Static dependencies (like jobs) have a fixed state once resolved - they cannot
50
+ go from DONE back to WAIT. This is the default behavior.
51
+ """
53
52
 
54
53
  def __init__(self, origin):
55
- # Origin and target are two resources
54
+ # Origin is the resource this dependency points to
56
55
  self.origin = origin
57
- self.target: Optional["Job"] = None
58
- self.currentstatus = DependencyStatus.WAIT
56
+ # Target will be set by scheduler when registering the job
57
+ self.target = None
58
+
59
+ def is_dynamic(self) -> bool:
60
+ """Returns True if this is a dynamic dependency (can change state)"""
61
+ return False
62
+
63
+ @abstractmethod
64
+ async def aio_lock(self, timeout: float = 0) -> Lock:
65
+ """Acquire a lock on this dependency asynchronously
66
+
67
+ Args:
68
+ timeout: Timeout in seconds (0 = wait indefinitely)
59
69
 
60
- def status(self) -> DependencyStatus:
61
- raise NotImplementedError()
70
+ Returns:
71
+ Lock object
62
72
 
63
- def lock(self) -> Lock:
64
- raise NotImplementedError()
73
+ Raises:
74
+ LockError: If lock cannot be acquired within timeout
75
+ RuntimeError: If dependency failed
76
+ """
77
+ pass
65
78
 
66
79
  def __repr__(self) -> str:
67
- return "Dep[{origin}->{target}]/{currentstatus}".format(**self.__dict__)
68
-
69
- def check(self):
70
- assert self.target is not None
71
- status = self.status()
72
- logger.debug("Dependency check: %s", self)
73
- if status != self.currentstatus:
74
- logger.debug(
75
- "Dependency %s is %s (was: %s)", self, status, self.currentstatus
76
- )
77
- self.target.dependencychanged(self, self.currentstatus, status)
78
- self.currentstatus = status
80
+ return f"Dep[{self.origin}]"
81
+
82
+
83
+ class DynamicDependency(Dependency):
84
+ """Base class for dynamic dependencies
85
+
86
+ Dynamic dependencies (like tokens) can change state at any time - availability
87
+ can go from OK to WAIT and back. These require special handling during lock
88
+ acquisition with retry logic.
89
+ """
90
+
91
+ def is_dynamic(self) -> bool:
92
+ """Returns True - this is a dynamic dependency"""
93
+ return True
@@ -1,184 +1,313 @@
1
- """Handles dynamic task outputs"""
1
+ """Handles dynamic task outputs
2
+
3
+ This module provides support for tasks that produce dynamic outputs during
4
+ execution. These outputs can trigger callbacks that submit new tasks.
5
+
6
+ Key concepts:
7
+ - TaskOutputs: Monitors a task's output file for events
8
+ - TaskOutputsWorker: Processes events and calls registered callbacks
9
+ """
2
10
 
3
11
  import asyncio
4
12
  import json
5
- import logging
6
13
  import queue
7
14
  import threading
8
- from collections import defaultdict
9
- from functools import cached_property
10
15
  from pathlib import Path
11
- from typing import Callable, TYPE_CHECKING
16
+ from typing import Callable, Dict, List, Set, TYPE_CHECKING
12
17
 
13
18
  from watchdog.events import FileSystemEventHandler
14
19
 
15
20
  from experimaestro.ipc import ipcom
16
21
  from experimaestro.utils import logger
17
22
 
18
- from .base import Job, experiment
19
-
20
23
  if TYPE_CHECKING:
21
24
  from experimaestro.core.objects import WatchedOutput
22
-
23
-
24
- class TaskOutputCallbackHandler:
25
- def __init__(self, converter: Callable):
26
- pass
25
+ from experimaestro.scheduler.experiment import experiment
26
+
27
+
28
+ class TaskOutputWatcher:
29
+ """Watches a specific output method for a configuration within a job"""
30
+
31
+ def __init__(
32
+ self,
33
+ key: str,
34
+ method: Callable,
35
+ worker: "TaskOutputsWorker",
36
+ ):
37
+ self.key = key
38
+ self.method = method
39
+ self.worker = worker
40
+ self.callbacks: Set[Callable] = set()
41
+ self.processed_events: List[dict] = []
42
+
43
+ def add_callback(self, callback: Callable):
44
+ """Add a callback and replay any existing events"""
45
+ # Replay processed events to new callback (don't update count for replays)
46
+ for event in self.processed_events:
47
+ self.worker.add(callback, event, update_count=False)
48
+ self.callbacks.add(callback)
49
+
50
+ def process_event(self, raw_event: dict):
51
+ """Process a raw event from the task output file"""
52
+ # Call the method to convert the raw event to a configuration
53
+ try:
54
+ # The method signature is: method(dep, *args, **kwargs) -> Config
55
+ # We need to provide a marker function that marks the output
56
+ def mark_output(config):
57
+ """Marker function that just returns the config"""
58
+ return config
59
+
60
+ result = self.method(mark_output, *raw_event["args"], **raw_event["kwargs"])
61
+ self.processed_events.append(result)
62
+
63
+ # Dispatch to all callbacks
64
+ for callback in self.callbacks:
65
+ self.worker.add(callback, result)
66
+ except Exception:
67
+ logger.exception("Error processing task output event")
27
68
 
28
69
 
29
70
  class TaskOutputs(FileSystemEventHandler):
30
- """Represent and monitors dynamic outputs generated by one task"""
71
+ """Monitors dynamic outputs generated by one task"""
31
72
 
32
- #: Global dictionary for handles
33
- HANDLERS: dict[Path, "TaskOutputs"] = {}
73
+ #: Global dictionary mapping paths to TaskOutputs instances
74
+ HANDLERS: Dict[Path, "TaskOutputs"] = {}
34
75
 
35
- #: Global lock to access current HANDLERS
76
+ #: Global lock for accessing HANDLERS
36
77
  LOCK = threading.Lock()
37
78
 
38
- def create(job: Job):
79
+ @staticmethod
80
+ def get_or_create(path: Path, worker: "TaskOutputsWorker") -> "TaskOutputs":
81
+ """Get or create a TaskOutputs instance for the given path"""
39
82
  with TaskOutputs.LOCK:
40
- if instance := TaskOutputs.get(job.task_outputs_path, None):
83
+ if path in TaskOutputs.HANDLERS:
84
+ instance = TaskOutputs.HANDLERS[path]
85
+ # Update worker reference in case this is a new experiment
86
+ instance.worker = worker
87
+ # Clear old watchers - new ones will be added and replay events
88
+ instance.watchers.clear()
41
89
  return instance
42
90
 
43
- instance = TaskOutputs(job.task_outputs_path)
44
- TaskOutputs[job.task_outputs_path] = instance
91
+ instance = TaskOutputs(path, worker)
92
+ TaskOutputs.HANDLERS[path] = instance
45
93
  return instance
46
94
 
47
- def __init__(self, path: Path):
48
- """Monitors an event path"""
49
- logger.debug("Watching dynamic task outputs in %s", path)
95
+ def __init__(self, path: Path, worker: "TaskOutputsWorker"):
96
+ """Initialize monitoring for a task output path"""
97
+ super().__init__()
98
+ logger.debug("Creating TaskOutputs monitor for %s", path)
50
99
  self.path = path
51
- self.handle = None
52
- self.count = 0
53
- self.lock = threading.Lock()
54
- self.listeners: dict[str, dict[Callable, set[Callable]]] = defaultdict(
55
- lambda: defaultdict(set)
56
- )
57
-
58
- #: The events registered so far
59
- self.events = []
60
-
61
- def __enter__(self):
62
- """Starts monitoring task outputs"""
63
- self.job.task_outputs_path.parent.mkdir(parents=True, exist_ok=True)
64
- with self.lock:
65
- if self.handle is None:
66
- assert self.count == 0
67
- self.handle = ipcom().fswatch(self, self.path.parent, False)
68
- self.count += 1
69
- return self
70
-
71
- def __exit__(self, *args):
72
- """Stops monitoring task outputs"""
73
- with self.lock:
74
- self.count -= 1
75
- if self.count == 0:
76
- ipcom().fsunwatch(self.handle)
77
- self.fh.close()
78
-
79
- self.handle = None
80
- self._fh = None
81
-
82
- def watch_output(self, watched: "WatchedOutput"):
83
- """Add a new listener"""
84
- key = f"{watched.config.__identifier__}/{watched.method_name}"
85
- with self.lock:
86
- # Process events so far
87
- listener = self.listeners[key].get(watched.method, None)
88
- if listener is None:
89
- listener = TaskOutputCallbackHandler(watched.method)
90
-
91
- # Register
92
- self.listeners[key][watched.method].add(watched.callback)
93
-
94
- #
95
- # --- Events
96
- #
97
-
98
- @cached_property
99
- def fh(self):
100
- if self._fh is None:
101
- self._fh = self.path.open("rt")
102
- return self._fh
100
+ self.worker = worker
101
+ self._watch_handle = None
102
+ self._file_handle = None
103
+ self._lock = threading.Lock()
104
+
105
+ # Map from key (config_id/method_name) to TaskOutputWatcher
106
+ self.watchers: Dict[str, TaskOutputWatcher] = {}
107
+
108
+ def start_watching(self):
109
+ """Start watching the task output file"""
110
+ logger.debug("Starting to watch task outputs at %s", self.path)
111
+ with self._lock:
112
+ if self._watch_handle is not None:
113
+ return # Already watching
114
+
115
+ # Ensure the directory exists
116
+ self.path.parent.mkdir(parents=True, exist_ok=True)
117
+
118
+ # Start file system watching
119
+ self._watch_handle = ipcom().fswatch(self, self.path.parent, False)
120
+ logger.debug("Started watching directory %s", self.path.parent)
121
+
122
+ # Process any existing content
123
+ self._process_file()
124
+
125
+ def stop_watching(self):
126
+ """Stop watching the task output file"""
127
+ with self._lock:
128
+ if self._watch_handle is not None:
129
+ try:
130
+ ipcom().fsunwatch(self._watch_handle)
131
+ except KeyError:
132
+ pass # Already unwatched
133
+ self._watch_handle = None
134
+
135
+ if self._file_handle is not None:
136
+ self._file_handle.close()
137
+ self._file_handle = None
138
+
139
+ def add_watcher(self, watched: "WatchedOutput"):
140
+ """Add a watcher for a specific output method"""
141
+ # Use the identifier from the config - watched.config is actually a Config object
142
+ # (method.__self__), not a ConfigInformation, despite the type annotation
143
+ config_id = watched.config.__xpm__.identifier.all.hex()
144
+ key = f"{config_id}/{watched.method_name}"
145
+ logger.debug("Adding watcher for key: %s", key)
146
+
147
+ with self._lock:
148
+ is_new = key not in self.watchers
149
+ if is_new:
150
+ self.watchers[key] = TaskOutputWatcher(key, watched.method, self.worker)
151
+
152
+ # If this is a new watcher and the file already exists, replay events from file
153
+ if is_new and self.path.exists():
154
+ self._replay_events_for_key(key)
155
+
156
+ self.watchers[key].add_callback(watched.callback)
157
+
158
+ def _replay_events_for_key(self, key: str):
159
+ """Replay events from the file for a specific key"""
160
+ if not self.path.exists():
161
+ return
103
162
 
104
- def on_modified(self, event):
105
- self.handle(Path(event.src_path))
163
+ with self.path.open("rt") as f:
164
+ for line in f:
165
+ line = line.strip()
166
+ if not line:
167
+ continue
106
168
 
107
- def on_created(self, event):
108
- self.handle(Path(event.src_path))
169
+ try:
170
+ event = json.loads(line)
171
+ if event.get("key") == key:
172
+ self.watchers[key].process_event(event)
173
+ except json.JSONDecodeError:
174
+ logger.warning("Invalid JSON in task output: %s", line)
175
+ except Exception:
176
+ logger.exception("Error processing task output line")
109
177
 
110
- def handle(self, path: Path):
111
- if path != self.path:
178
+ def _process_file(self):
179
+ """Process the task output file"""
180
+ if not self.path.exists():
112
181
  return
113
182
 
114
- with self.lock:
115
- logger.debug("[TASK OUTPUT] Handling task output for %s", self.path)
116
-
117
- while json_line := self.fh.readline():
118
- # Read the event
119
- event = json.loads(json_line)
120
- logger.debug("Event: %s", event)
121
-
122
- # FIXME: move elsewhere
123
- # # Process the event
124
- # event = self.config_method(
125
- # self.job.config.__xpm__.mark_output,
126
- # *event["args"],
127
- # **event["kwargs"],
128
- # )
183
+ if self._file_handle is None:
184
+ self._file_handle = self.path.open("rt")
185
+
186
+ while line := self._file_handle.readline():
187
+ line = line.strip()
188
+ if not line:
189
+ continue
190
+
191
+ try:
192
+ event = json.loads(line)
193
+ key = event.get("key")
194
+ if key and key in self.watchers:
195
+ self.watchers[key].process_event(event)
196
+ except json.JSONDecodeError:
197
+ logger.warning("Invalid JSON in task output: %s", line)
198
+ except Exception:
199
+ logger.exception("Error processing task output line")
200
+
201
+ # FileSystemEventHandler methods
202
+ def on_modified(self, event):
203
+ if Path(event.src_path) == self.path:
204
+ with self._lock:
205
+ self._process_file()
129
206
 
130
- self.events.append(event)
131
- # self.job.scheduler.xp.taskOutputsWorker.add(self, event)
207
+ def on_created(self, event):
208
+ if Path(event.src_path) == self.path:
209
+ with self._lock:
210
+ self._process_file()
132
211
 
133
212
 
134
213
  class TaskOutputsWorker(threading.Thread):
135
- """This worker process dynamic output queue for one experiment"""
214
+ """Worker thread that processes task output callbacks"""
136
215
 
137
- def __init__(self, xp: experiment):
138
- super().__init__(name="task outputs worker", daemon=True)
139
- self.queue = queue.Queue()
216
+ def __init__(self, xp: "experiment"):
217
+ super().__init__(name="task-outputs-worker", daemon=True)
218
+ self.queue: queue.Queue = queue.Queue()
140
219
  self.xp = xp
220
+ self._monitors: Dict[Path, TaskOutputs] = {}
221
+ self._lock = threading.Lock()
141
222
 
142
223
  def watch_output(self, watched: "WatchedOutput"):
143
- """Watch an output
224
+ """Register a watched output
144
225
 
145
226
  :param watched: The watched output specification
146
227
  """
147
- logger.debug("Registering task output listener %s", watched)
228
+ # Get the job's task output path
229
+ job = watched.job
230
+ if job is None:
231
+ logger.warning("Cannot watch output without job: %s", watched)
232
+ return
233
+
234
+ path = job.task_outputs_path
235
+ logger.debug("Registering task output listener at %s", path)
236
+
237
+ with self._lock:
238
+ if path not in self._monitors:
239
+ monitor = TaskOutputs.get_or_create(path, self)
240
+ self._monitors[path] = monitor
241
+ monitor.start_watching()
242
+ else:
243
+ monitor = self._monitors[path]
244
+
245
+ monitor.add_watcher(watched)
246
+
247
+ def add(self, callback: Callable, event, update_count: bool = True):
248
+ """Add an event to the processing queue
148
249
 
149
- # path = watched.job.tasks_output_path
150
- TaskOutputs.create(watched.job).watch_output(watched)
250
+ :param callback: The callback to call with the event
251
+ :param event: The event data
252
+ :param update_count: Whether to update the task output count (False for replays)
253
+ """
254
+ if update_count:
255
+ asyncio.run_coroutine_threadsafe(
256
+ self.xp.update_task_output_count(1),
257
+ self.xp.scheduler.loop,
258
+ ).result()
151
259
 
152
- def add(self, watcher, event):
153
- asyncio.run_coroutine_threadsafe(
154
- self.xp.update_task_output_count(1),
155
- self.xp.scheduler.loop,
156
- ).result()
157
- self.queue.put((watcher, event))
260
+ self.queue.put((callback, event, update_count))
158
261
 
159
262
  def run(self):
160
- logging.debug("Starting output listener queue")
263
+ """Main worker loop"""
264
+ logger.debug("Starting task outputs worker")
161
265
  while True:
162
- # Get the next element in the queue
163
266
  element = self.queue.get()
164
267
  if element is None:
165
- # end of processing
268
+ # Shutdown signal
166
269
  break
167
270
 
168
- # Call all the listeners
169
- logging.debug("Got one event: %s", element)
170
- watcher, event = element
171
- for listener in watcher.listeners:
172
- try:
173
- logger.debug("Calling listener [%s] with %s", listener, event)
174
- listener(event)
175
- logger.debug(
176
- "[done] Calling listener [%s] with %s", listener, event
177
- )
178
- except Exception:
179
- logging.exception("Exception while calling the listener")
271
+ callback, event, update_count = element
272
+ try:
273
+ logger.debug("Calling callback %s with event %s", callback, event)
274
+ callback(event)
275
+ except Exception:
276
+ logger.exception("Error in task output callback")
277
+ finally:
180
278
  self.queue.task_done()
279
+ if update_count:
280
+ asyncio.run_coroutine_threadsafe(
281
+ self.xp.update_task_output_count(-1),
282
+ self.xp.scheduler.loop,
283
+ ).result()
284
+
285
+ logger.debug("Task outputs worker stopped")
181
286
 
182
- asyncio.run_coroutine_threadsafe(
183
- self.xp.update_task_output_count(-1), self.xp.scheduler.loop
184
- ).result()
287
+ def process_job_outputs(self, job) -> None:
288
+ """Explicitly process any remaining task outputs for a completed job.
289
+
290
+ This is called when a job finishes to ensure all task outputs written
291
+ by the job are processed before the experiment considers exiting.
292
+ This is necessary because file system watchers may have latency.
293
+
294
+ :param job: The job that has finished
295
+ """
296
+ path = job.task_outputs_path
297
+ with self._lock:
298
+ monitor = self._monitors.get(path)
299
+
300
+ if monitor is not None:
301
+ with monitor._lock:
302
+ monitor._process_file()
303
+
304
+ def shutdown(self):
305
+ """Stop the worker and all monitors"""
306
+ # Stop all monitors
307
+ with self._lock:
308
+ for monitor in self._monitors.values():
309
+ monitor.stop_watching()
310
+ self._monitors.clear()
311
+
312
+ # Signal the worker to stop
313
+ self.queue.put(None)