experimaestro 1.6.1__py3-none-any.whl → 1.15.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- experimaestro/__init__.py +14 -3
- experimaestro/annotations.py +13 -3
- experimaestro/cli/filter.py +19 -5
- experimaestro/cli/jobs.py +12 -5
- experimaestro/commandline.py +3 -7
- experimaestro/connectors/__init__.py +27 -12
- experimaestro/connectors/local.py +19 -10
- experimaestro/connectors/ssh.py +1 -1
- experimaestro/core/arguments.py +35 -3
- experimaestro/core/callbacks.py +52 -0
- experimaestro/core/context.py +8 -9
- experimaestro/core/identifier.py +301 -0
- experimaestro/core/objects/__init__.py +44 -0
- experimaestro/core/{objects.py → objects/config.py} +364 -716
- experimaestro/core/objects/config_utils.py +58 -0
- experimaestro/core/objects/config_walk.py +151 -0
- experimaestro/core/objects.pyi +15 -45
- experimaestro/core/serialization.py +63 -9
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/types.py +61 -6
- experimaestro/experiments/cli.py +79 -29
- experimaestro/experiments/configuration.py +3 -0
- experimaestro/generators.py +6 -1
- experimaestro/ipc.py +4 -1
- experimaestro/launcherfinder/parser.py +8 -3
- experimaestro/launcherfinder/registry.py +29 -10
- experimaestro/launcherfinder/specs.py +49 -10
- experimaestro/launchers/slurm/base.py +51 -13
- experimaestro/mkdocs/__init__.py +1 -1
- experimaestro/notifications.py +2 -1
- experimaestro/run.py +3 -1
- experimaestro/scheduler/base.py +114 -6
- experimaestro/scheduler/dynamic_outputs.py +184 -0
- experimaestro/scheduler/state.py +75 -0
- experimaestro/scheduler/workspace.py +2 -1
- experimaestro/scriptbuilder.py +13 -2
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
- experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
- experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
- experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro/server/data/favicon.ico +0 -0
- experimaestro/server/data/index.css +22963 -0
- experimaestro/server/data/index.css.map +1 -0
- experimaestro/server/data/index.html +27 -0
- experimaestro/server/data/index.js +101770 -0
- experimaestro/server/data/index.js.map +1 -0
- experimaestro/server/data/login.html +22 -0
- experimaestro/server/data/manifest.json +15 -0
- experimaestro/settings.py +2 -2
- experimaestro/sphinx/__init__.py +7 -17
- experimaestro/taskglobals.py +7 -2
- experimaestro/tests/core/__init__.py +0 -0
- experimaestro/tests/core/test_generics.py +206 -0
- experimaestro/tests/definitions_types.py +5 -3
- experimaestro/tests/launchers/bin/sbatch +34 -7
- experimaestro/tests/launchers/bin/srun +5 -0
- experimaestro/tests/launchers/common.py +16 -4
- experimaestro/tests/restart.py +9 -4
- experimaestro/tests/tasks/all.py +23 -10
- experimaestro/tests/tasks/foreign.py +2 -4
- experimaestro/tests/test_dependencies.py +0 -6
- experimaestro/tests/test_experiment.py +73 -0
- experimaestro/tests/test_findlauncher.py +11 -4
- experimaestro/tests/test_forward.py +5 -5
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +114 -99
- experimaestro/tests/test_instance.py +6 -21
- experimaestro/tests/test_objects.py +20 -4
- experimaestro/tests/test_param.py +60 -22
- experimaestro/tests/test_serializers.py +24 -64
- experimaestro/tests/test_tags.py +5 -11
- experimaestro/tests/test_tasks.py +10 -23
- experimaestro/tests/test_tokens.py +3 -2
- experimaestro/tests/test_types.py +20 -17
- experimaestro/tests/test_validation.py +48 -91
- experimaestro/tokens.py +16 -5
- experimaestro/typingutils.py +8 -8
- experimaestro/utils/asyncio.py +6 -2
- experimaestro/utils/multiprocessing.py +44 -0
- experimaestro/utils/resources.py +7 -3
- {experimaestro-1.6.1.dist-info → experimaestro-1.15.2.dist-info}/METADATA +27 -34
- experimaestro-1.15.2.dist-info/RECORD +159 -0
- {experimaestro-1.6.1.dist-info → experimaestro-1.15.2.dist-info}/WHEEL +1 -1
- experimaestro-1.6.1.dist-info/RECORD +0 -122
- {experimaestro-1.6.1.dist-info → experimaestro-1.15.2.dist-info}/entry_points.txt +0 -0
- {experimaestro-1.6.1.dist-info → experimaestro-1.15.2.dist-info/licenses}/LICENSE +0 -0
experimaestro/run.py
CHANGED
|
@@ -8,6 +8,7 @@ import json
|
|
|
8
8
|
from typing import List
|
|
9
9
|
import fasteners
|
|
10
10
|
from experimaestro.notifications import progress, report_eoj
|
|
11
|
+
from experimaestro.utils.multiprocessing import delayed_shutdown
|
|
11
12
|
from .core.types import ObjectType
|
|
12
13
|
from experimaestro.utils import logger
|
|
13
14
|
from experimaestro.core.objects import ConfigInformation
|
|
@@ -96,6 +97,7 @@ class TaskRunner:
|
|
|
96
97
|
self.failedpath.write_text(str(code))
|
|
97
98
|
self.cleanup()
|
|
98
99
|
logger.info("Exiting")
|
|
100
|
+
delayed_shutdown(60, exit_code=code)
|
|
99
101
|
sys.exit(1)
|
|
100
102
|
|
|
101
103
|
def run(self):
|
|
@@ -140,10 +142,10 @@ class TaskRunner:
|
|
|
140
142
|
run(workdir / "params.json")
|
|
141
143
|
|
|
142
144
|
# ... remove the handlers
|
|
143
|
-
logger.info("Task ended successfully")
|
|
144
145
|
remove_signal_handlers(remove_cleanup=False)
|
|
145
146
|
|
|
146
147
|
# Everything went OK
|
|
148
|
+
logger.info("Task ended successfully")
|
|
147
149
|
sys.exit(0)
|
|
148
150
|
except Exception:
|
|
149
151
|
logger.exception("Got exception while running")
|
experimaestro/scheduler/base.py
CHANGED
|
@@ -1,12 +1,22 @@
|
|
|
1
1
|
from collections import ChainMap
|
|
2
2
|
from functools import cached_property
|
|
3
|
+
import itertools
|
|
3
4
|
import logging
|
|
4
5
|
import os
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from shutil import rmtree
|
|
7
8
|
import threading
|
|
8
9
|
import time
|
|
9
|
-
from typing import
|
|
10
|
+
from typing import (
|
|
11
|
+
Any,
|
|
12
|
+
Iterator,
|
|
13
|
+
List,
|
|
14
|
+
Optional,
|
|
15
|
+
Set,
|
|
16
|
+
TypeVar,
|
|
17
|
+
Union,
|
|
18
|
+
TYPE_CHECKING,
|
|
19
|
+
)
|
|
10
20
|
import enum
|
|
11
21
|
import signal
|
|
12
22
|
import asyncio
|
|
@@ -17,9 +27,10 @@ from experimaestro.scheduler.services import Service
|
|
|
17
27
|
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
18
28
|
|
|
19
29
|
|
|
20
|
-
from experimaestro.core.objects import Config, ConfigWalkContext
|
|
30
|
+
from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
|
|
21
31
|
from experimaestro.utils import logger
|
|
22
32
|
from experimaestro.locking import Locks, LockError, Lock
|
|
33
|
+
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
23
34
|
from .workspace import RunMode, Workspace
|
|
24
35
|
from .dependencies import Dependency, DependencyStatus, Resource
|
|
25
36
|
import concurrent.futures
|
|
@@ -110,7 +121,7 @@ class JobDependency(Dependency):
|
|
|
110
121
|
|
|
111
122
|
|
|
112
123
|
class Job(Resource):
|
|
113
|
-
"""A job is a
|
|
124
|
+
"""A job is a resource that is produced by the execution of some code"""
|
|
114
125
|
|
|
115
126
|
# Set by the scheduler
|
|
116
127
|
_readyEvent: Optional[asyncio.Event]
|
|
@@ -148,6 +159,11 @@ class Job(Resource):
|
|
|
148
159
|
# Dependencies
|
|
149
160
|
self.dependencies: Set[Dependency] = set() # as target
|
|
150
161
|
|
|
162
|
+
# Watched outputs
|
|
163
|
+
self.watched_outputs = {}
|
|
164
|
+
for watched in config.__xpm__.watched_outputs:
|
|
165
|
+
self.watch_output(watched)
|
|
166
|
+
|
|
151
167
|
# Process
|
|
152
168
|
self._process = None
|
|
153
169
|
self.unsatisfied = 0
|
|
@@ -159,6 +175,23 @@ class Job(Resource):
|
|
|
159
175
|
self._progress: List[LevelInformation] = []
|
|
160
176
|
self.tags = config.tags()
|
|
161
177
|
|
|
178
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
179
|
+
"""Monitor task outputs
|
|
180
|
+
|
|
181
|
+
:param watched: A description of the watched output
|
|
182
|
+
"""
|
|
183
|
+
self.scheduler.xp.watch_output(watched)
|
|
184
|
+
|
|
185
|
+
def task_output_update(self, subpath: Path):
|
|
186
|
+
"""Notification of an updated task output"""
|
|
187
|
+
if watcher := self.watched_outputs.get(subpath, None):
|
|
188
|
+
watcher.update()
|
|
189
|
+
|
|
190
|
+
def done_handler(self):
|
|
191
|
+
"""The task has been completed"""
|
|
192
|
+
for watcher in self.watched_outputs.values():
|
|
193
|
+
watcher.update()
|
|
194
|
+
|
|
162
195
|
def __str__(self):
|
|
163
196
|
return "Job[{}]".format(self.identifier)
|
|
164
197
|
|
|
@@ -166,6 +199,11 @@ class Job(Resource):
|
|
|
166
199
|
assert self._future, "Cannot wait a not submitted job"
|
|
167
200
|
return self._future.result()
|
|
168
201
|
|
|
202
|
+
@cached_property
|
|
203
|
+
def python_path(self) -> Iterator[str]:
|
|
204
|
+
"""Returns an iterator over python path"""
|
|
205
|
+
return itertools.chain(self.workspace.python_path)
|
|
206
|
+
|
|
169
207
|
@cached_property
|
|
170
208
|
def environ(self):
|
|
171
209
|
"""Returns the job environment
|
|
@@ -219,7 +257,7 @@ class Job(Resource):
|
|
|
219
257
|
return self.state == JobState.READY
|
|
220
258
|
|
|
221
259
|
@property
|
|
222
|
-
def jobpath(self):
|
|
260
|
+
def jobpath(self) -> Path:
|
|
223
261
|
"""Deprecated, use `path`"""
|
|
224
262
|
return self.workspace.jobspath / self.relpath
|
|
225
263
|
|
|
@@ -227,6 +265,14 @@ class Job(Resource):
|
|
|
227
265
|
def path(self) -> Path:
|
|
228
266
|
return self.workspace.jobspath / self.relpath
|
|
229
267
|
|
|
268
|
+
@property
|
|
269
|
+
def experimaestro_path(self) -> Path:
|
|
270
|
+
return (self.path / ".experimaestro").resolve()
|
|
271
|
+
|
|
272
|
+
@cached_property
|
|
273
|
+
def task_outputs_path(self) -> Path:
|
|
274
|
+
return self.experimaestro_path / "task-outputs.jsonl"
|
|
275
|
+
|
|
230
276
|
@property
|
|
231
277
|
def relpath(self):
|
|
232
278
|
identifier = self.config.__xpm__.identifier
|
|
@@ -436,7 +482,7 @@ class Scheduler:
|
|
|
436
482
|
self.jobs: Dict[str, "Job"] = {}
|
|
437
483
|
|
|
438
484
|
# List of jobs
|
|
439
|
-
self.waitingjobs = set()
|
|
485
|
+
self.waitingjobs: Set[Job] = set()
|
|
440
486
|
|
|
441
487
|
# Listeners
|
|
442
488
|
self.listeners: Set[Listener] = set()
|
|
@@ -459,14 +505,17 @@ class Scheduler:
|
|
|
459
505
|
|
|
460
506
|
def submit(self, job: Job) -> Optional[Job]:
|
|
461
507
|
# Wait for the future containing the submitted job
|
|
508
|
+
logger.debug("Registering the job %s within the scheduler", job)
|
|
462
509
|
otherFuture = asyncio.run_coroutine_threadsafe(
|
|
463
510
|
self.aio_registerJob(job), self.loop
|
|
464
511
|
)
|
|
465
512
|
other = otherFuture.result()
|
|
513
|
+
logger.debug("Job already submitted" if other else "First submission")
|
|
466
514
|
if other:
|
|
467
515
|
return other
|
|
468
516
|
|
|
469
517
|
job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
|
|
518
|
+
return None
|
|
470
519
|
|
|
471
520
|
def prepare(self, job: Job):
|
|
472
521
|
"""Prepares the job for running"""
|
|
@@ -598,9 +647,13 @@ class Scheduler:
|
|
|
598
647
|
if job.state != JobState.DONE:
|
|
599
648
|
self.xp.failedJobs[job.identifier] = job
|
|
600
649
|
|
|
650
|
+
# Process all remaining tasks outputs
|
|
651
|
+
await asyncThreadcheck("End of job processing", job.done_handler)
|
|
652
|
+
|
|
601
653
|
# Decrement the number of unfinished jobs and notify
|
|
602
654
|
self.xp.unfinishedJobs -= 1
|
|
603
655
|
async with self.xp.central.exitCondition:
|
|
656
|
+
logging.debug("Updated number of unfinished jobs")
|
|
604
657
|
self.xp.central.exitCondition.notify_all()
|
|
605
658
|
|
|
606
659
|
job.endtime = time.time()
|
|
@@ -688,6 +741,7 @@ class Scheduler:
|
|
|
688
741
|
code = await process.aio_code()
|
|
689
742
|
logger.debug("Got return code %s for %s", code, job)
|
|
690
743
|
|
|
744
|
+
# Check the file if there is no return code
|
|
691
745
|
if code is None:
|
|
692
746
|
# Case where we cannot retrieve the code right away
|
|
693
747
|
if job.donepath.is_file():
|
|
@@ -766,6 +820,7 @@ class experiment:
|
|
|
766
820
|
"""
|
|
767
821
|
|
|
768
822
|
from experimaestro.server import Server
|
|
823
|
+
from experimaestro.scheduler import Listener
|
|
769
824
|
|
|
770
825
|
settings = get_settings()
|
|
771
826
|
if not isinstance(env, WorkspaceSettings):
|
|
@@ -782,6 +837,7 @@ class experiment:
|
|
|
782
837
|
self.xplock = None
|
|
783
838
|
self.old_experiment = None
|
|
784
839
|
self.services: Dict[str, Service] = {}
|
|
840
|
+
self._job_listener: Optional[Listener] = None
|
|
785
841
|
|
|
786
842
|
# Get configuration settings
|
|
787
843
|
|
|
@@ -853,6 +909,7 @@ class experiment:
|
|
|
853
909
|
assert self.central is not None
|
|
854
910
|
async with self.central.exitCondition:
|
|
855
911
|
self.exitMode = True
|
|
912
|
+
logging.debug("Setting exit mode to true")
|
|
856
913
|
self.central.exitCondition.notify_all()
|
|
857
914
|
|
|
858
915
|
assert self.central is not None and self.central.loop is not None
|
|
@@ -863,10 +920,22 @@ class experiment:
|
|
|
863
920
|
|
|
864
921
|
async def awaitcompletion():
|
|
865
922
|
assert self.central is not None
|
|
923
|
+
logger.debug("Waiting to exit scheduler...")
|
|
866
924
|
async with self.central.exitCondition:
|
|
867
925
|
while True:
|
|
868
|
-
if self.
|
|
926
|
+
if self.exitMode:
|
|
927
|
+
break
|
|
928
|
+
|
|
929
|
+
# If we have still unfinished jobs or possible new tasks, wait
|
|
930
|
+
logger.debug(
|
|
931
|
+
"Checking exit condition: unfinished jobs=%d, task output queue size=%d",
|
|
932
|
+
self.unfinishedJobs,
|
|
933
|
+
self.taskOutputQueueSize,
|
|
934
|
+
)
|
|
935
|
+
if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
|
|
869
936
|
break
|
|
937
|
+
|
|
938
|
+
# Wait for more news...
|
|
870
939
|
await self.central.exitCondition.wait()
|
|
871
940
|
|
|
872
941
|
if self.failedJobs:
|
|
@@ -898,6 +967,8 @@ class experiment:
|
|
|
898
967
|
return self.workspace.connector.createtoken(name, count)
|
|
899
968
|
|
|
900
969
|
def __enter__(self):
|
|
970
|
+
from .dynamic_outputs import TaskOutputsWorker
|
|
971
|
+
|
|
901
972
|
if self.workspace.run_mode != RunMode.DRY_RUN:
|
|
902
973
|
logger.info("Locking experiment %s", self.xplockpath)
|
|
903
974
|
self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
|
|
@@ -926,6 +997,7 @@ class experiment:
|
|
|
926
997
|
global SIGNAL_HANDLER
|
|
927
998
|
# Number of unfinished jobs
|
|
928
999
|
self.unfinishedJobs = 0
|
|
1000
|
+
self.taskOutputQueueSize = 0
|
|
929
1001
|
|
|
930
1002
|
# List of failed jobs
|
|
931
1003
|
self.failedJobs: Dict[str, Job] = {}
|
|
@@ -934,6 +1006,8 @@ class experiment:
|
|
|
934
1006
|
self.exitMode = False
|
|
935
1007
|
|
|
936
1008
|
self.central = SchedulerCentral.create(self.scheduler.name)
|
|
1009
|
+
self.taskOutputsWorker = TaskOutputsWorker(self)
|
|
1010
|
+
self.taskOutputsWorker.start()
|
|
937
1011
|
|
|
938
1012
|
SIGNAL_HANDLER.add(self)
|
|
939
1013
|
|
|
@@ -942,6 +1016,7 @@ class experiment:
|
|
|
942
1016
|
return self
|
|
943
1017
|
|
|
944
1018
|
def __exit__(self, exc_type, exc_value, traceback):
|
|
1019
|
+
logger.debug("Exiting scheduler context")
|
|
945
1020
|
# If no exception and normal run mode, remove old "jobs"
|
|
946
1021
|
if self.workspace.run_mode == RunMode.NORMAL:
|
|
947
1022
|
if exc_type is None and self.jobsbakpath.is_dir():
|
|
@@ -967,8 +1042,13 @@ class experiment:
|
|
|
967
1042
|
service.stop()
|
|
968
1043
|
|
|
969
1044
|
if self.central is not None:
|
|
1045
|
+
logger.info("Stopping scheduler event loop")
|
|
970
1046
|
self.central.loop.stop()
|
|
971
1047
|
|
|
1048
|
+
if self.taskOutputsWorker is not None:
|
|
1049
|
+
logger.info("Stopping tasks outputs worker")
|
|
1050
|
+
self.taskOutputsWorker.queue.put(None)
|
|
1051
|
+
|
|
972
1052
|
self.central = None
|
|
973
1053
|
self.workspace.__exit__(exc_type, exc_value, traceback)
|
|
974
1054
|
if self.xplock:
|
|
@@ -977,8 +1057,36 @@ class experiment:
|
|
|
977
1057
|
# Put back old experiment as current one
|
|
978
1058
|
experiment.CURRENT = self.old_experiment
|
|
979
1059
|
if self.server:
|
|
1060
|
+
logger.info("Stopping web server")
|
|
980
1061
|
self.server.stop()
|
|
981
1062
|
|
|
1063
|
+
if self.workspace.run_mode == RunMode.NORMAL:
|
|
1064
|
+
# Write the state
|
|
1065
|
+
logging.info("Saving the experiment state")
|
|
1066
|
+
from experimaestro.scheduler.state import ExperimentState
|
|
1067
|
+
|
|
1068
|
+
ExperimentState.save(
|
|
1069
|
+
self.workdir / "state.json", self.scheduler.jobs.values()
|
|
1070
|
+
)
|
|
1071
|
+
|
|
1072
|
+
async def update_task_output_count(self, delta: int):
|
|
1073
|
+
"""Change in the number of task outputs to process"""
|
|
1074
|
+
async with self.central.exitCondition:
|
|
1075
|
+
self.taskOutputQueueSize += delta
|
|
1076
|
+
logging.debug(
|
|
1077
|
+
"Updating queue size with %d => %d", delta, self.taskOutputQueueSize
|
|
1078
|
+
)
|
|
1079
|
+
if self.taskOutputQueueSize == 0:
|
|
1080
|
+
self.central.exitCondition.notify_all()
|
|
1081
|
+
|
|
1082
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
1083
|
+
"""Watch an output
|
|
1084
|
+
|
|
1085
|
+
:param watched: The watched output specification
|
|
1086
|
+
"""
|
|
1087
|
+
|
|
1088
|
+
self.taskOutputsWorker.watch_output(watched)
|
|
1089
|
+
|
|
982
1090
|
def add_service(self, service: ServiceClass) -> ServiceClass:
|
|
983
1091
|
"""Adds a service (e.g. tensorboard viewer) to the experiment
|
|
984
1092
|
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Handles dynamic task outputs"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import queue
|
|
7
|
+
import threading
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from functools import cached_property
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Callable, TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
from watchdog.events import FileSystemEventHandler
|
|
14
|
+
|
|
15
|
+
from experimaestro.ipc import ipcom
|
|
16
|
+
from experimaestro.utils import logger
|
|
17
|
+
|
|
18
|
+
from .base import Job, experiment
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from experimaestro.core.objects import WatchedOutput
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TaskOutputCallbackHandler:
|
|
25
|
+
def __init__(self, converter: Callable):
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TaskOutputs(FileSystemEventHandler):
|
|
30
|
+
"""Represent and monitors dynamic outputs generated by one task"""
|
|
31
|
+
|
|
32
|
+
#: Global dictionary for handles
|
|
33
|
+
HANDLERS: dict[Path, "TaskOutputs"] = {}
|
|
34
|
+
|
|
35
|
+
#: Global lock to access current HANDLERS
|
|
36
|
+
LOCK = threading.Lock()
|
|
37
|
+
|
|
38
|
+
def create(job: Job):
|
|
39
|
+
with TaskOutputs.LOCK:
|
|
40
|
+
if instance := TaskOutputs.get(job.task_outputs_path, None):
|
|
41
|
+
return instance
|
|
42
|
+
|
|
43
|
+
instance = TaskOutputs(job.task_outputs_path)
|
|
44
|
+
TaskOutputs[job.task_outputs_path] = instance
|
|
45
|
+
return instance
|
|
46
|
+
|
|
47
|
+
def __init__(self, path: Path):
|
|
48
|
+
"""Monitors an event path"""
|
|
49
|
+
logger.debug("Watching dynamic task outputs in %s", path)
|
|
50
|
+
self.path = path
|
|
51
|
+
self.handle = None
|
|
52
|
+
self.count = 0
|
|
53
|
+
self.lock = threading.Lock()
|
|
54
|
+
self.listeners: dict[str, dict[Callable, set[Callable]]] = defaultdict(
|
|
55
|
+
lambda: defaultdict(set)
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
#: The events registered so far
|
|
59
|
+
self.events = []
|
|
60
|
+
|
|
61
|
+
def __enter__(self):
|
|
62
|
+
"""Starts monitoring task outputs"""
|
|
63
|
+
self.job.task_outputs_path.parent.mkdir(parents=True, exist_ok=True)
|
|
64
|
+
with self.lock:
|
|
65
|
+
if self.handle is None:
|
|
66
|
+
assert self.count == 0
|
|
67
|
+
self.handle = ipcom().fswatch(self, self.path.parent, False)
|
|
68
|
+
self.count += 1
|
|
69
|
+
return self
|
|
70
|
+
|
|
71
|
+
def __exit__(self, *args):
|
|
72
|
+
"""Stops monitoring task outputs"""
|
|
73
|
+
with self.lock:
|
|
74
|
+
self.count -= 1
|
|
75
|
+
if self.count == 0:
|
|
76
|
+
ipcom().fsunwatch(self.handle)
|
|
77
|
+
self.fh.close()
|
|
78
|
+
|
|
79
|
+
self.handle = None
|
|
80
|
+
self._fh = None
|
|
81
|
+
|
|
82
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
83
|
+
"""Add a new listener"""
|
|
84
|
+
key = f"{watched.config.__identifier__}/{watched.method_name}"
|
|
85
|
+
with self.lock:
|
|
86
|
+
# Process events so far
|
|
87
|
+
listener = self.listeners[key].get(watched.method, None)
|
|
88
|
+
if listener is None:
|
|
89
|
+
listener = TaskOutputCallbackHandler(watched.method)
|
|
90
|
+
|
|
91
|
+
# Register
|
|
92
|
+
self.listeners[key][watched.method].add(watched.callback)
|
|
93
|
+
|
|
94
|
+
#
|
|
95
|
+
# --- Events
|
|
96
|
+
#
|
|
97
|
+
|
|
98
|
+
@cached_property
|
|
99
|
+
def fh(self):
|
|
100
|
+
if self._fh is None:
|
|
101
|
+
self._fh = self.path.open("rt")
|
|
102
|
+
return self._fh
|
|
103
|
+
|
|
104
|
+
def on_modified(self, event):
|
|
105
|
+
self.handle(Path(event.src_path))
|
|
106
|
+
|
|
107
|
+
def on_created(self, event):
|
|
108
|
+
self.handle(Path(event.src_path))
|
|
109
|
+
|
|
110
|
+
def handle(self, path: Path):
|
|
111
|
+
if path != self.path:
|
|
112
|
+
return
|
|
113
|
+
|
|
114
|
+
with self.lock:
|
|
115
|
+
logger.debug("[TASK OUTPUT] Handling task output for %s", self.path)
|
|
116
|
+
|
|
117
|
+
while json_line := self.fh.readline():
|
|
118
|
+
# Read the event
|
|
119
|
+
event = json.loads(json_line)
|
|
120
|
+
logger.debug("Event: %s", event)
|
|
121
|
+
|
|
122
|
+
# FIXME: move elsewhere
|
|
123
|
+
# # Process the event
|
|
124
|
+
# event = self.config_method(
|
|
125
|
+
# self.job.config.__xpm__.mark_output,
|
|
126
|
+
# *event["args"],
|
|
127
|
+
# **event["kwargs"],
|
|
128
|
+
# )
|
|
129
|
+
|
|
130
|
+
self.events.append(event)
|
|
131
|
+
# self.job.scheduler.xp.taskOutputsWorker.add(self, event)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class TaskOutputsWorker(threading.Thread):
|
|
135
|
+
"""This worker process dynamic output queue for one experiment"""
|
|
136
|
+
|
|
137
|
+
def __init__(self, xp: experiment):
|
|
138
|
+
super().__init__(name="task outputs worker", daemon=True)
|
|
139
|
+
self.queue = queue.Queue()
|
|
140
|
+
self.xp = xp
|
|
141
|
+
|
|
142
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
143
|
+
"""Watch an output
|
|
144
|
+
|
|
145
|
+
:param watched: The watched output specification
|
|
146
|
+
"""
|
|
147
|
+
logger.debug("Registering task output listener %s", watched)
|
|
148
|
+
|
|
149
|
+
# path = watched.job.tasks_output_path
|
|
150
|
+
TaskOutputs.create(watched.job).watch_output(watched)
|
|
151
|
+
|
|
152
|
+
def add(self, watcher, event):
|
|
153
|
+
asyncio.run_coroutine_threadsafe(
|
|
154
|
+
self.xp.update_task_output_count(1),
|
|
155
|
+
self.xp.scheduler.loop,
|
|
156
|
+
).result()
|
|
157
|
+
self.queue.put((watcher, event))
|
|
158
|
+
|
|
159
|
+
def run(self):
|
|
160
|
+
logging.debug("Starting output listener queue")
|
|
161
|
+
while True:
|
|
162
|
+
# Get the next element in the queue
|
|
163
|
+
element = self.queue.get()
|
|
164
|
+
if element is None:
|
|
165
|
+
# end of processing
|
|
166
|
+
break
|
|
167
|
+
|
|
168
|
+
# Call all the listeners
|
|
169
|
+
logging.debug("Got one event: %s", element)
|
|
170
|
+
watcher, event = element
|
|
171
|
+
for listener in watcher.listeners:
|
|
172
|
+
try:
|
|
173
|
+
logger.debug("Calling listener [%s] with %s", listener, event)
|
|
174
|
+
listener(event)
|
|
175
|
+
logger.debug(
|
|
176
|
+
"[done] Calling listener [%s] with %s", listener, event
|
|
177
|
+
)
|
|
178
|
+
except Exception:
|
|
179
|
+
logging.exception("Exception while calling the listener")
|
|
180
|
+
self.queue.task_done()
|
|
181
|
+
|
|
182
|
+
asyncio.run_coroutine_threadsafe(
|
|
183
|
+
self.xp.update_task_output_count(-1), self.xp.scheduler.loop
|
|
184
|
+
).result()
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Iterable, Optional, Type
|
|
5
|
+
from experimaestro import Task
|
|
6
|
+
|
|
7
|
+
from experimaestro.core.context import SerializationContext
|
|
8
|
+
from experimaestro.scheduler.base import Job, JobDependency
|
|
9
|
+
from experimaestro.settings import find_workspace
|
|
10
|
+
from experimaestro.core.serialization import from_state_dict, save_definition
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
|
|
14
|
+
class JobInformation:
|
|
15
|
+
id: str
|
|
16
|
+
path: Path
|
|
17
|
+
task: Task
|
|
18
|
+
tags: dict[str, str]
|
|
19
|
+
depends_on: list["JobInformation"] = field(default_factory=list)
|
|
20
|
+
|
|
21
|
+
def __post_init__(self):
|
|
22
|
+
self.path = Path(self.path)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ExperimentState:
|
|
26
|
+
def __init__(self, workdir: Path, name: str):
|
|
27
|
+
path = workdir / "xp" / name / "state.json"
|
|
28
|
+
with path.open("rt") as fh:
|
|
29
|
+
content = json.load(fh)
|
|
30
|
+
|
|
31
|
+
self.states: dict[str, JobInformation] = {
|
|
32
|
+
state_dict["id"]: JobInformation(**state_dict)
|
|
33
|
+
for state_dict in from_state_dict(content, as_instance=False)
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
for state in self.states.values():
|
|
37
|
+
state.depends_on = [self.states[key] for key in state.depends_on]
|
|
38
|
+
|
|
39
|
+
def get_jobs(self, task_class: Type[Task]) -> list[JobInformation]:
|
|
40
|
+
if task_class is None:
|
|
41
|
+
return list(self.data.values())
|
|
42
|
+
|
|
43
|
+
tasks = []
|
|
44
|
+
for job_state in self.states.values():
|
|
45
|
+
if isinstance(job_state.task, task_class):
|
|
46
|
+
tasks.append(job_state)
|
|
47
|
+
return tasks
|
|
48
|
+
|
|
49
|
+
@staticmethod
|
|
50
|
+
def save(path: Path, jobs: Iterable[Job]):
|
|
51
|
+
save_definition(
|
|
52
|
+
[
|
|
53
|
+
{
|
|
54
|
+
"id": job.identifier,
|
|
55
|
+
"path": str(job.path),
|
|
56
|
+
"task": job.config,
|
|
57
|
+
"tags": job.config.__xpm__.tags(),
|
|
58
|
+
"depends_on": list(
|
|
59
|
+
dep.origin.identifier
|
|
60
|
+
for dep in job.dependencies
|
|
61
|
+
if isinstance(dep, JobDependency)
|
|
62
|
+
),
|
|
63
|
+
}
|
|
64
|
+
for job in jobs
|
|
65
|
+
],
|
|
66
|
+
SerializationContext(),
|
|
67
|
+
path,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_experiment(
|
|
72
|
+
name: str, *, workspace: Optional[str] = None, workdir: Optional[Path] = None
|
|
73
|
+
) -> ExperimentState:
|
|
74
|
+
ws = find_workspace(workspace=workspace, workdir=workdir)
|
|
75
|
+
return ExperimentState(ws.path, name)
|
|
@@ -2,7 +2,7 @@ from collections import ChainMap
|
|
|
2
2
|
from enum import Enum
|
|
3
3
|
from functools import cached_property
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Optional
|
|
5
|
+
from typing import Iterator, Optional
|
|
6
6
|
from experimaestro.settings import WorkspaceSettings, Settings
|
|
7
7
|
|
|
8
8
|
|
|
@@ -46,6 +46,7 @@ class Workspace:
|
|
|
46
46
|
path = path.absolute()
|
|
47
47
|
self.path = path
|
|
48
48
|
self.run_mode = run_mode
|
|
49
|
+
self.python_path = []
|
|
49
50
|
from ..launchers import Launcher
|
|
50
51
|
|
|
51
52
|
self.launcher = launcher or Launcher.get(path)
|
experimaestro/scriptbuilder.py
CHANGED
|
@@ -51,6 +51,8 @@ class PythonScriptBuilder:
|
|
|
51
51
|
self.lockfiles: List[Path] = []
|
|
52
52
|
self.notificationURL: Optional[str] = None
|
|
53
53
|
self.command: Optional[AbstractCommand] = None
|
|
54
|
+
|
|
55
|
+
# This is used to serialize the full process identifier on disk
|
|
54
56
|
self.processtype = "local"
|
|
55
57
|
|
|
56
58
|
def write(self, job: CommandLineJob):
|
|
@@ -63,7 +65,7 @@ class PythonScriptBuilder:
|
|
|
63
65
|
job {CommandLineJob} -- [description]
|
|
64
66
|
|
|
65
67
|
Returns:
|
|
66
|
-
|
|
68
|
+
str -- The script path on disk
|
|
67
69
|
"""
|
|
68
70
|
assert isinstance(
|
|
69
71
|
job, CommandLineJob
|
|
@@ -94,6 +96,7 @@ class PythonScriptBuilder:
|
|
|
94
96
|
out.write("# Experimaestro generated task\n\n")
|
|
95
97
|
out.write(
|
|
96
98
|
"""import logging\n"""
|
|
99
|
+
"""import sys\n"""
|
|
97
100
|
"""logging.basicConfig(level=logging.INFO, """
|
|
98
101
|
"""format='%(levelname)s:%(process)d:%(asctime)s [%(name)s] %(message)s', datefmt='%y-%m-%d %H:%M:%S')\n\n"""
|
|
99
102
|
)
|
|
@@ -112,9 +115,17 @@ class PythonScriptBuilder:
|
|
|
112
115
|
out.write(" ]\n")
|
|
113
116
|
|
|
114
117
|
for name, value in job.environ.items():
|
|
115
|
-
|
|
118
|
+
if name == "PYTHONPATH":
|
|
119
|
+
# Handles properly python path
|
|
120
|
+
for path in value.split(":"):
|
|
121
|
+
out.write(f""" sys.path.insert(0, "{shquote(path)}")\n""")
|
|
122
|
+
else:
|
|
123
|
+
out.write(f""" os.environ["{name}"] = "{shquote(value)}"\n""")
|
|
116
124
|
out.write("\n")
|
|
117
125
|
|
|
126
|
+
for path in job.python_path:
|
|
127
|
+
out.write(f""" sys.path.insert(0, "{shquote(str(path))}")\n""")
|
|
128
|
+
|
|
118
129
|
out.write(
|
|
119
130
|
f""" TaskRunner("{shquote(connector.resolve(scriptpath))}","""
|
|
120
131
|
""" lockfiles).run()\n"""
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|