experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +278 -7
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +111 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +510 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +256 -31
- experimaestro/scheduler/interfaces.py +501 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/client.py +874 -0
- experimaestro/scheduler/remote/protocol.py +467 -0
- experimaestro/scheduler/remote/server.py +423 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +323 -23
- experimaestro/scheduler/state_db.py +437 -0
- experimaestro/scheduler/state_provider.py +2766 -0
- experimaestro/scheduler/state_sync.py +891 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_remote_state.py +671 -0
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2395 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b8.dist-info/RECORD +187 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,15 +1,11 @@
|
|
|
1
1
|
"""Dependency between tasks and tokens"""
|
|
2
2
|
|
|
3
3
|
import threading
|
|
4
|
-
from typing import
|
|
5
|
-
import
|
|
4
|
+
from typing import Set
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
6
|
from enum import Enum
|
|
7
|
-
from ..utils import logger
|
|
8
7
|
from ..locking import Lock
|
|
9
8
|
|
|
10
|
-
if TYPE_CHECKING:
|
|
11
|
-
from . import Job
|
|
12
|
-
|
|
13
9
|
|
|
14
10
|
class Dependents:
|
|
15
11
|
"""Encapsulate the access to the dependents"""
|
|
@@ -47,32 +43,51 @@ class DependencyStatus(Enum):
|
|
|
47
43
|
"""Dependency won't be availabe in the foreseeable future"""
|
|
48
44
|
|
|
49
45
|
|
|
50
|
-
class Dependency:
|
|
51
|
-
|
|
52
|
-
|
|
46
|
+
class Dependency(ABC):
|
|
47
|
+
"""Base class for dependencies
|
|
48
|
+
|
|
49
|
+
Static dependencies (like jobs) have a fixed state once resolved - they cannot
|
|
50
|
+
go from DONE back to WAIT. This is the default behavior.
|
|
51
|
+
"""
|
|
53
52
|
|
|
54
53
|
def __init__(self, origin):
|
|
55
|
-
# Origin
|
|
54
|
+
# Origin is the resource this dependency points to
|
|
56
55
|
self.origin = origin
|
|
57
|
-
|
|
58
|
-
self.
|
|
56
|
+
# Target will be set by scheduler when registering the job
|
|
57
|
+
self.target = None
|
|
58
|
+
|
|
59
|
+
def is_dynamic(self) -> bool:
|
|
60
|
+
"""Returns True if this is a dynamic dependency (can change state)"""
|
|
61
|
+
return False
|
|
62
|
+
|
|
63
|
+
@abstractmethod
|
|
64
|
+
async def aio_lock(self, timeout: float = 0) -> Lock:
|
|
65
|
+
"""Acquire a lock on this dependency asynchronously
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
timeout: Timeout in seconds (0 = wait indefinitely)
|
|
59
69
|
|
|
60
|
-
|
|
61
|
-
|
|
70
|
+
Returns:
|
|
71
|
+
Lock object
|
|
62
72
|
|
|
63
|
-
|
|
64
|
-
|
|
73
|
+
Raises:
|
|
74
|
+
LockError: If lock cannot be acquired within timeout
|
|
75
|
+
RuntimeError: If dependency failed
|
|
76
|
+
"""
|
|
77
|
+
pass
|
|
65
78
|
|
|
66
79
|
def __repr__(self) -> str:
|
|
67
|
-
return "Dep[{origin}
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
80
|
+
return f"Dep[{self.origin}]"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class DynamicDependency(Dependency):
|
|
84
|
+
"""Base class for dynamic dependencies
|
|
85
|
+
|
|
86
|
+
Dynamic dependencies (like tokens) can change state at any time - availability
|
|
87
|
+
can go from OK to WAIT and back. These require special handling during lock
|
|
88
|
+
acquisition with retry logic.
|
|
89
|
+
"""
|
|
90
|
+
|
|
91
|
+
def is_dynamic(self) -> bool:
|
|
92
|
+
"""Returns True - this is a dynamic dependency"""
|
|
93
|
+
return True
|
|
@@ -1,184 +1,313 @@
|
|
|
1
|
-
"""Handles dynamic task outputs
|
|
1
|
+
"""Handles dynamic task outputs
|
|
2
|
+
|
|
3
|
+
This module provides support for tasks that produce dynamic outputs during
|
|
4
|
+
execution. These outputs can trigger callbacks that submit new tasks.
|
|
5
|
+
|
|
6
|
+
Key concepts:
|
|
7
|
+
- TaskOutputs: Monitors a task's output file for events
|
|
8
|
+
- TaskOutputsWorker: Processes events and calls registered callbacks
|
|
9
|
+
"""
|
|
2
10
|
|
|
3
11
|
import asyncio
|
|
4
12
|
import json
|
|
5
|
-
import logging
|
|
6
13
|
import queue
|
|
7
14
|
import threading
|
|
8
|
-
from collections import defaultdict
|
|
9
|
-
from functools import cached_property
|
|
10
15
|
from pathlib import Path
|
|
11
|
-
from typing import Callable, TYPE_CHECKING
|
|
16
|
+
from typing import Callable, Dict, List, Set, TYPE_CHECKING
|
|
12
17
|
|
|
13
18
|
from watchdog.events import FileSystemEventHandler
|
|
14
19
|
|
|
15
20
|
from experimaestro.ipc import ipcom
|
|
16
21
|
from experimaestro.utils import logger
|
|
17
22
|
|
|
18
|
-
from .base import Job, experiment
|
|
19
|
-
|
|
20
23
|
if TYPE_CHECKING:
|
|
21
24
|
from experimaestro.core.objects import WatchedOutput
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
from experimaestro.scheduler.experiment import experiment
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TaskOutputWatcher:
|
|
29
|
+
"""Watches a specific output method for a configuration within a job"""
|
|
30
|
+
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
key: str,
|
|
34
|
+
method: Callable,
|
|
35
|
+
worker: "TaskOutputsWorker",
|
|
36
|
+
):
|
|
37
|
+
self.key = key
|
|
38
|
+
self.method = method
|
|
39
|
+
self.worker = worker
|
|
40
|
+
self.callbacks: Set[Callable] = set()
|
|
41
|
+
self.processed_events: List[dict] = []
|
|
42
|
+
|
|
43
|
+
def add_callback(self, callback: Callable):
|
|
44
|
+
"""Add a callback and replay any existing events"""
|
|
45
|
+
# Replay processed events to new callback (don't update count for replays)
|
|
46
|
+
for event in self.processed_events:
|
|
47
|
+
self.worker.add(callback, event, update_count=False)
|
|
48
|
+
self.callbacks.add(callback)
|
|
49
|
+
|
|
50
|
+
def process_event(self, raw_event: dict):
|
|
51
|
+
"""Process a raw event from the task output file"""
|
|
52
|
+
# Call the method to convert the raw event to a configuration
|
|
53
|
+
try:
|
|
54
|
+
# The method signature is: method(dep, *args, **kwargs) -> Config
|
|
55
|
+
# We need to provide a marker function that marks the output
|
|
56
|
+
def mark_output(config):
|
|
57
|
+
"""Marker function that just returns the config"""
|
|
58
|
+
return config
|
|
59
|
+
|
|
60
|
+
result = self.method(mark_output, *raw_event["args"], **raw_event["kwargs"])
|
|
61
|
+
self.processed_events.append(result)
|
|
62
|
+
|
|
63
|
+
# Dispatch to all callbacks
|
|
64
|
+
for callback in self.callbacks:
|
|
65
|
+
self.worker.add(callback, result)
|
|
66
|
+
except Exception:
|
|
67
|
+
logger.exception("Error processing task output event")
|
|
27
68
|
|
|
28
69
|
|
|
29
70
|
class TaskOutputs(FileSystemEventHandler):
|
|
30
|
-
"""
|
|
71
|
+
"""Monitors dynamic outputs generated by one task"""
|
|
31
72
|
|
|
32
|
-
#: Global dictionary
|
|
33
|
-
HANDLERS:
|
|
73
|
+
#: Global dictionary mapping paths to TaskOutputs instances
|
|
74
|
+
HANDLERS: Dict[Path, "TaskOutputs"] = {}
|
|
34
75
|
|
|
35
|
-
#: Global lock
|
|
76
|
+
#: Global lock for accessing HANDLERS
|
|
36
77
|
LOCK = threading.Lock()
|
|
37
78
|
|
|
38
|
-
|
|
79
|
+
@staticmethod
|
|
80
|
+
def get_or_create(path: Path, worker: "TaskOutputsWorker") -> "TaskOutputs":
|
|
81
|
+
"""Get or create a TaskOutputs instance for the given path"""
|
|
39
82
|
with TaskOutputs.LOCK:
|
|
40
|
-
if
|
|
83
|
+
if path in TaskOutputs.HANDLERS:
|
|
84
|
+
instance = TaskOutputs.HANDLERS[path]
|
|
85
|
+
# Update worker reference in case this is a new experiment
|
|
86
|
+
instance.worker = worker
|
|
87
|
+
# Clear old watchers - new ones will be added and replay events
|
|
88
|
+
instance.watchers.clear()
|
|
41
89
|
return instance
|
|
42
90
|
|
|
43
|
-
instance = TaskOutputs(
|
|
44
|
-
TaskOutputs[
|
|
91
|
+
instance = TaskOutputs(path, worker)
|
|
92
|
+
TaskOutputs.HANDLERS[path] = instance
|
|
45
93
|
return instance
|
|
46
94
|
|
|
47
|
-
def __init__(self, path: Path):
|
|
48
|
-
"""
|
|
49
|
-
|
|
95
|
+
def __init__(self, path: Path, worker: "TaskOutputsWorker"):
|
|
96
|
+
"""Initialize monitoring for a task output path"""
|
|
97
|
+
super().__init__()
|
|
98
|
+
logger.debug("Creating TaskOutputs monitor for %s", path)
|
|
50
99
|
self.path = path
|
|
51
|
-
self.
|
|
52
|
-
self.
|
|
53
|
-
self.
|
|
54
|
-
self.
|
|
55
|
-
|
|
56
|
-
)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
self.
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
100
|
+
self.worker = worker
|
|
101
|
+
self._watch_handle = None
|
|
102
|
+
self._file_handle = None
|
|
103
|
+
self._lock = threading.Lock()
|
|
104
|
+
|
|
105
|
+
# Map from key (config_id/method_name) to TaskOutputWatcher
|
|
106
|
+
self.watchers: Dict[str, TaskOutputWatcher] = {}
|
|
107
|
+
|
|
108
|
+
def start_watching(self):
|
|
109
|
+
"""Start watching the task output file"""
|
|
110
|
+
logger.debug("Starting to watch task outputs at %s", self.path)
|
|
111
|
+
with self._lock:
|
|
112
|
+
if self._watch_handle is not None:
|
|
113
|
+
return # Already watching
|
|
114
|
+
|
|
115
|
+
# Ensure the directory exists
|
|
116
|
+
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
117
|
+
|
|
118
|
+
# Start file system watching
|
|
119
|
+
self._watch_handle = ipcom().fswatch(self, self.path.parent, False)
|
|
120
|
+
logger.debug("Started watching directory %s", self.path.parent)
|
|
121
|
+
|
|
122
|
+
# Process any existing content
|
|
123
|
+
self._process_file()
|
|
124
|
+
|
|
125
|
+
def stop_watching(self):
|
|
126
|
+
"""Stop watching the task output file"""
|
|
127
|
+
with self._lock:
|
|
128
|
+
if self._watch_handle is not None:
|
|
129
|
+
try:
|
|
130
|
+
ipcom().fsunwatch(self._watch_handle)
|
|
131
|
+
except KeyError:
|
|
132
|
+
pass # Already unwatched
|
|
133
|
+
self._watch_handle = None
|
|
134
|
+
|
|
135
|
+
if self._file_handle is not None:
|
|
136
|
+
self._file_handle.close()
|
|
137
|
+
self._file_handle = None
|
|
138
|
+
|
|
139
|
+
def add_watcher(self, watched: "WatchedOutput"):
|
|
140
|
+
"""Add a watcher for a specific output method"""
|
|
141
|
+
# Use the identifier from the config - watched.config is actually a Config object
|
|
142
|
+
# (method.__self__), not a ConfigInformation, despite the type annotation
|
|
143
|
+
config_id = watched.config.__xpm__.identifier.all.hex()
|
|
144
|
+
key = f"{config_id}/{watched.method_name}"
|
|
145
|
+
logger.debug("Adding watcher for key: %s", key)
|
|
146
|
+
|
|
147
|
+
with self._lock:
|
|
148
|
+
is_new = key not in self.watchers
|
|
149
|
+
if is_new:
|
|
150
|
+
self.watchers[key] = TaskOutputWatcher(key, watched.method, self.worker)
|
|
151
|
+
|
|
152
|
+
# If this is a new watcher and the file already exists, replay events from file
|
|
153
|
+
if is_new and self.path.exists():
|
|
154
|
+
self._replay_events_for_key(key)
|
|
155
|
+
|
|
156
|
+
self.watchers[key].add_callback(watched.callback)
|
|
157
|
+
|
|
158
|
+
def _replay_events_for_key(self, key: str):
|
|
159
|
+
"""Replay events from the file for a specific key"""
|
|
160
|
+
if not self.path.exists():
|
|
161
|
+
return
|
|
103
162
|
|
|
104
|
-
|
|
105
|
-
|
|
163
|
+
with self.path.open("rt") as f:
|
|
164
|
+
for line in f:
|
|
165
|
+
line = line.strip()
|
|
166
|
+
if not line:
|
|
167
|
+
continue
|
|
106
168
|
|
|
107
|
-
|
|
108
|
-
|
|
169
|
+
try:
|
|
170
|
+
event = json.loads(line)
|
|
171
|
+
if event.get("key") == key:
|
|
172
|
+
self.watchers[key].process_event(event)
|
|
173
|
+
except json.JSONDecodeError:
|
|
174
|
+
logger.warning("Invalid JSON in task output: %s", line)
|
|
175
|
+
except Exception:
|
|
176
|
+
logger.exception("Error processing task output line")
|
|
109
177
|
|
|
110
|
-
def
|
|
111
|
-
|
|
178
|
+
def _process_file(self):
|
|
179
|
+
"""Process the task output file"""
|
|
180
|
+
if not self.path.exists():
|
|
112
181
|
return
|
|
113
182
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
183
|
+
if self._file_handle is None:
|
|
184
|
+
self._file_handle = self.path.open("rt")
|
|
185
|
+
|
|
186
|
+
while line := self._file_handle.readline():
|
|
187
|
+
line = line.strip()
|
|
188
|
+
if not line:
|
|
189
|
+
continue
|
|
190
|
+
|
|
191
|
+
try:
|
|
192
|
+
event = json.loads(line)
|
|
193
|
+
key = event.get("key")
|
|
194
|
+
if key and key in self.watchers:
|
|
195
|
+
self.watchers[key].process_event(event)
|
|
196
|
+
except json.JSONDecodeError:
|
|
197
|
+
logger.warning("Invalid JSON in task output: %s", line)
|
|
198
|
+
except Exception:
|
|
199
|
+
logger.exception("Error processing task output line")
|
|
200
|
+
|
|
201
|
+
# FileSystemEventHandler methods
|
|
202
|
+
def on_modified(self, event):
|
|
203
|
+
if Path(event.src_path) == self.path:
|
|
204
|
+
with self._lock:
|
|
205
|
+
self._process_file()
|
|
129
206
|
|
|
130
|
-
|
|
131
|
-
|
|
207
|
+
def on_created(self, event):
|
|
208
|
+
if Path(event.src_path) == self.path:
|
|
209
|
+
with self._lock:
|
|
210
|
+
self._process_file()
|
|
132
211
|
|
|
133
212
|
|
|
134
213
|
class TaskOutputsWorker(threading.Thread):
|
|
135
|
-
"""
|
|
214
|
+
"""Worker thread that processes task output callbacks"""
|
|
136
215
|
|
|
137
|
-
def __init__(self, xp: experiment):
|
|
138
|
-
super().__init__(name="task
|
|
139
|
-
self.queue = queue.Queue()
|
|
216
|
+
def __init__(self, xp: "experiment"):
|
|
217
|
+
super().__init__(name="task-outputs-worker", daemon=True)
|
|
218
|
+
self.queue: queue.Queue = queue.Queue()
|
|
140
219
|
self.xp = xp
|
|
220
|
+
self._monitors: Dict[Path, TaskOutputs] = {}
|
|
221
|
+
self._lock = threading.Lock()
|
|
141
222
|
|
|
142
223
|
def watch_output(self, watched: "WatchedOutput"):
|
|
143
|
-
"""
|
|
224
|
+
"""Register a watched output
|
|
144
225
|
|
|
145
226
|
:param watched: The watched output specification
|
|
146
227
|
"""
|
|
147
|
-
|
|
228
|
+
# Get the job's task output path
|
|
229
|
+
job = watched.job
|
|
230
|
+
if job is None:
|
|
231
|
+
logger.warning("Cannot watch output without job: %s", watched)
|
|
232
|
+
return
|
|
233
|
+
|
|
234
|
+
path = job.task_outputs_path
|
|
235
|
+
logger.debug("Registering task output listener at %s", path)
|
|
236
|
+
|
|
237
|
+
with self._lock:
|
|
238
|
+
if path not in self._monitors:
|
|
239
|
+
monitor = TaskOutputs.get_or_create(path, self)
|
|
240
|
+
self._monitors[path] = monitor
|
|
241
|
+
monitor.start_watching()
|
|
242
|
+
else:
|
|
243
|
+
monitor = self._monitors[path]
|
|
244
|
+
|
|
245
|
+
monitor.add_watcher(watched)
|
|
246
|
+
|
|
247
|
+
def add(self, callback: Callable, event, update_count: bool = True):
|
|
248
|
+
"""Add an event to the processing queue
|
|
148
249
|
|
|
149
|
-
|
|
150
|
-
|
|
250
|
+
:param callback: The callback to call with the event
|
|
251
|
+
:param event: The event data
|
|
252
|
+
:param update_count: Whether to update the task output count (False for replays)
|
|
253
|
+
"""
|
|
254
|
+
if update_count:
|
|
255
|
+
asyncio.run_coroutine_threadsafe(
|
|
256
|
+
self.xp.update_task_output_count(1),
|
|
257
|
+
self.xp.scheduler.loop,
|
|
258
|
+
).result()
|
|
151
259
|
|
|
152
|
-
|
|
153
|
-
asyncio.run_coroutine_threadsafe(
|
|
154
|
-
self.xp.update_task_output_count(1),
|
|
155
|
-
self.xp.scheduler.loop,
|
|
156
|
-
).result()
|
|
157
|
-
self.queue.put((watcher, event))
|
|
260
|
+
self.queue.put((callback, event, update_count))
|
|
158
261
|
|
|
159
262
|
def run(self):
|
|
160
|
-
|
|
263
|
+
"""Main worker loop"""
|
|
264
|
+
logger.debug("Starting task outputs worker")
|
|
161
265
|
while True:
|
|
162
|
-
# Get the next element in the queue
|
|
163
266
|
element = self.queue.get()
|
|
164
267
|
if element is None:
|
|
165
|
-
#
|
|
268
|
+
# Shutdown signal
|
|
166
269
|
break
|
|
167
270
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
logger.debug(
|
|
176
|
-
"[done] Calling listener [%s] with %s", listener, event
|
|
177
|
-
)
|
|
178
|
-
except Exception:
|
|
179
|
-
logging.exception("Exception while calling the listener")
|
|
271
|
+
callback, event, update_count = element
|
|
272
|
+
try:
|
|
273
|
+
logger.debug("Calling callback %s with event %s", callback, event)
|
|
274
|
+
callback(event)
|
|
275
|
+
except Exception:
|
|
276
|
+
logger.exception("Error in task output callback")
|
|
277
|
+
finally:
|
|
180
278
|
self.queue.task_done()
|
|
279
|
+
if update_count:
|
|
280
|
+
asyncio.run_coroutine_threadsafe(
|
|
281
|
+
self.xp.update_task_output_count(-1),
|
|
282
|
+
self.xp.scheduler.loop,
|
|
283
|
+
).result()
|
|
284
|
+
|
|
285
|
+
logger.debug("Task outputs worker stopped")
|
|
181
286
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
287
|
+
def process_job_outputs(self, job) -> None:
|
|
288
|
+
"""Explicitly process any remaining task outputs for a completed job.
|
|
289
|
+
|
|
290
|
+
This is called when a job finishes to ensure all task outputs written
|
|
291
|
+
by the job are processed before the experiment considers exiting.
|
|
292
|
+
This is necessary because file system watchers may have latency.
|
|
293
|
+
|
|
294
|
+
:param job: The job that has finished
|
|
295
|
+
"""
|
|
296
|
+
path = job.task_outputs_path
|
|
297
|
+
with self._lock:
|
|
298
|
+
monitor = self._monitors.get(path)
|
|
299
|
+
|
|
300
|
+
if monitor is not None:
|
|
301
|
+
with monitor._lock:
|
|
302
|
+
monitor._process_file()
|
|
303
|
+
|
|
304
|
+
def shutdown(self):
|
|
305
|
+
"""Stop the worker and all monitors"""
|
|
306
|
+
# Stop all monitors
|
|
307
|
+
with self._lock:
|
|
308
|
+
for monitor in self._monitors.values():
|
|
309
|
+
monitor.stop_watching()
|
|
310
|
+
self._monitors.clear()
|
|
311
|
+
|
|
312
|
+
# Signal the worker to stop
|
|
313
|
+
self.queue.put(None)
|