experimaestro 1.6.2__py3-none-any.whl → 1.7.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (71)
  1. experimaestro/__init__.py +3 -1
  2. experimaestro/annotations.py +13 -3
  3. experimaestro/cli/filter.py +3 -3
  4. experimaestro/cli/jobs.py +1 -1
  5. experimaestro/connectors/__init__.py +17 -8
  6. experimaestro/connectors/local.py +8 -3
  7. experimaestro/core/arguments.py +26 -3
  8. experimaestro/core/objects.py +90 -6
  9. experimaestro/core/objects.pyi +7 -1
  10. experimaestro/core/types.py +33 -2
  11. experimaestro/generators.py +6 -1
  12. experimaestro/ipc.py +4 -1
  13. experimaestro/launcherfinder/registry.py +18 -4
  14. experimaestro/notifications.py +1 -1
  15. experimaestro/run.py +1 -1
  16. experimaestro/scheduler/base.py +94 -10
  17. experimaestro/scheduler/dynamic_outputs.py +184 -0
  18. experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
  19. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  20. experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
  21. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  22. experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
  23. experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
  24. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  25. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  26. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  27. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  28. experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
  29. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  30. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  31. experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
  32. experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
  33. experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
  34. experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
  35. experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
  36. experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
  37. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  38. experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
  39. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  40. experimaestro/server/data/favicon.ico +0 -0
  41. experimaestro/server/data/index.css +22963 -0
  42. experimaestro/server/data/index.css.map +1 -0
  43. experimaestro/server/data/index.html +27 -0
  44. experimaestro/server/data/index.js +101770 -0
  45. experimaestro/server/data/index.js.map +1 -0
  46. experimaestro/server/data/login.html +22 -0
  47. experimaestro/server/data/manifest.json +15 -0
  48. experimaestro/taskglobals.py +7 -2
  49. experimaestro/tests/definitions_types.py +5 -3
  50. experimaestro/tests/launchers/bin/sbatch +18 -5
  51. experimaestro/tests/launchers/common.py +11 -3
  52. experimaestro/tests/restart.py +6 -3
  53. experimaestro/tests/tasks/all.py +16 -10
  54. experimaestro/tests/tasks/foreign.py +2 -4
  55. experimaestro/tests/test_forward.py +5 -5
  56. experimaestro/tests/test_identifier.py +61 -66
  57. experimaestro/tests/test_instance.py +3 -6
  58. experimaestro/tests/test_param.py +40 -22
  59. experimaestro/tests/test_tags.py +5 -11
  60. experimaestro/tests/test_tokens.py +3 -2
  61. experimaestro/tests/test_types.py +17 -14
  62. experimaestro/tests/test_validation.py +48 -91
  63. experimaestro/tokens.py +16 -5
  64. experimaestro/typingutils.py +7 -0
  65. experimaestro/utils/asyncio.py +6 -2
  66. experimaestro/utils/resources.py +7 -3
  67. {experimaestro-1.6.2.dist-info → experimaestro-1.7.0rc1.dist-info}/METADATA +3 -4
  68. {experimaestro-1.6.2.dist-info → experimaestro-1.7.0rc1.dist-info}/RECORD +71 -40
  69. {experimaestro-1.6.2.dist-info → experimaestro-1.7.0rc1.dist-info}/WHEEL +1 -1
  70. {experimaestro-1.6.2.dist-info → experimaestro-1.7.0rc1.dist-info}/LICENSE +0 -0
  71. {experimaestro-1.6.2.dist-info → experimaestro-1.7.0rc1.dist-info}/entry_points.txt +0 -0
@@ -7,7 +7,16 @@ from pathlib import Path
7
7
  from shutil import rmtree
8
8
  import threading
9
9
  import time
10
- from typing import Any, Iterator, List, Optional, Set, TypeVar, Union, TYPE_CHECKING
10
+ from typing import (
11
+ Any,
12
+ Iterator,
13
+ List,
14
+ Optional,
15
+ Set,
16
+ TypeVar,
17
+ Union,
18
+ TYPE_CHECKING,
19
+ )
11
20
  import enum
12
21
  import signal
13
22
  import asyncio
@@ -18,9 +27,10 @@ from experimaestro.scheduler.services import Service
18
27
  from experimaestro.settings import WorkspaceSettings, get_settings
19
28
 
20
29
 
21
- from experimaestro.core.objects import Config, ConfigWalkContext
30
+ from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
22
31
  from experimaestro.utils import logger
23
32
  from experimaestro.locking import Locks, LockError, Lock
33
+ from experimaestro.utils.asyncio import asyncThreadcheck
24
34
  from .workspace import RunMode, Workspace
25
35
  from .dependencies import Dependency, DependencyStatus, Resource
26
36
  import concurrent.futures
@@ -111,7 +121,7 @@ class JobDependency(Dependency):
111
121
 
112
122
 
113
123
  class Job(Resource):
114
- """A job is a resouce that is produced by the execution of some code"""
124
+ """A job is a resource that is produced by the execution of some code"""
115
125
 
116
126
  # Set by the scheduler
117
127
  _readyEvent: Optional[asyncio.Event]
@@ -149,6 +159,11 @@ class Job(Resource):
149
159
  # Dependencies
150
160
  self.dependencies: Set[Dependency] = set() # as target
151
161
 
162
+ # Watched outputs
163
+ self.watched_outputs = {}
164
+ for watched in config.__xpm__.watched_outputs:
165
+ self.watch_output(watched)
166
+
152
167
  # Process
153
168
  self._process = None
154
169
  self.unsatisfied = 0
@@ -160,6 +175,23 @@ class Job(Resource):
160
175
  self._progress: List[LevelInformation] = []
161
176
  self.tags = config.tags()
162
177
 
178
+ def watch_output(self, watched: "WatchedOutput"):
179
+ """Monitor task outputs
180
+
181
+ :param watched: A description of the watched output
182
+ """
183
+ self.scheduler.xp.watch_output(watched)
184
+
185
+ def task_output_update(self, subpath: Path):
186
+ """Notification of an updated task output"""
187
+ if watcher := self.watched_outputs.get(subpath, None):
188
+ watcher.update()
189
+
190
+ def done_handler(self):
191
+ """The task has been completed"""
192
+ for watcher in self.watched_outputs.values():
193
+ watcher.update()
194
+
163
195
  def __str__(self):
164
196
  return "Job[{}]".format(self.identifier)
165
197
 
@@ -170,10 +202,8 @@ class Job(Resource):
170
202
  @cached_property
171
203
  def python_path(self) -> Iterator[str]:
172
204
  """Returns an iterator over python path"""
173
- return itertools.chain(
174
- self.workspace.python_path
175
- )
176
-
205
+ return itertools.chain(self.workspace.python_path)
206
+
177
207
  @cached_property
178
208
  def environ(self):
179
209
  """Returns the job environment
@@ -227,7 +257,7 @@ class Job(Resource):
227
257
  return self.state == JobState.READY
228
258
 
229
259
  @property
230
- def jobpath(self):
260
+ def jobpath(self) -> Path:
231
261
  """Deprecated, use `path`"""
232
262
  return self.workspace.jobspath / self.relpath
233
263
 
@@ -235,6 +265,14 @@ class Job(Resource):
235
265
  def path(self) -> Path:
236
266
  return self.workspace.jobspath / self.relpath
237
267
 
268
+ @property
269
+ def experimaestro_path(self) -> Path:
270
+ return (self.path / ".experimaestro").resolve()
271
+
272
+ @cached_property
273
+ def task_outputs_path(self) -> Path:
274
+ return self.experimaestro_path / "task-outputs.jsonl"
275
+
238
276
  @property
239
277
  def relpath(self):
240
278
  identifier = self.config.__xpm__.identifier
@@ -444,7 +482,7 @@ class Scheduler:
444
482
  self.jobs: Dict[str, "Job"] = {}
445
483
 
446
484
  # List of jobs
447
- self.waitingjobs = set()
485
+ self.waitingjobs: Set[Job] = set()
448
486
 
449
487
  # Listeners
450
488
  self.listeners: Set[Listener] = set()
@@ -467,10 +505,12 @@ class Scheduler:
467
505
 
468
506
  def submit(self, job: Job) -> Optional[Job]:
469
507
  # Wait for the future containing the submitted job
508
+ logger.debug("Registering the job %s within the scheduler", job)
470
509
  otherFuture = asyncio.run_coroutine_threadsafe(
471
510
  self.aio_registerJob(job), self.loop
472
511
  )
473
512
  other = otherFuture.result()
513
+ logger.debug("Job already submitted" if other else "First submission")
474
514
  if other:
475
515
  return other
476
516
 
@@ -606,9 +646,13 @@ class Scheduler:
606
646
  if job.state != JobState.DONE:
607
647
  self.xp.failedJobs[job.identifier] = job
608
648
 
649
+ # Process all remaining tasks outputs
650
+ await asyncThreadcheck("End of job processing", job.done_handler)
651
+
609
652
  # Decrement the number of unfinished jobs and notify
610
653
  self.xp.unfinishedJobs -= 1
611
654
  async with self.xp.central.exitCondition:
655
+ logging.debug("Updated number of unfinished jobs")
612
656
  self.xp.central.exitCondition.notify_all()
613
657
 
614
658
  job.endtime = time.time()
@@ -861,6 +905,7 @@ class experiment:
861
905
  assert self.central is not None
862
906
  async with self.central.exitCondition:
863
907
  self.exitMode = True
908
+ logging.debug("Setting exit mode to true")
864
909
  self.central.exitCondition.notify_all()
865
910
 
866
911
  assert self.central is not None and self.central.loop is not None
@@ -873,8 +918,19 @@ class experiment:
873
918
  assert self.central is not None
874
919
  async with self.central.exitCondition:
875
920
  while True:
876
- if self.unfinishedJobs == 0 or self.exitMode:
921
+ if self.exitMode:
877
922
  break
923
+
924
+ # If we have still unfinished jobs or possible new tasks, wait
925
+ logger.debug(
926
+ "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
927
+ self.unfinishedJobs,
928
+ self.taskOutputQueueSize,
929
+ )
930
+ if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
931
+ break
932
+
933
+ # Wait for more news...
878
934
  await self.central.exitCondition.wait()
879
935
 
880
936
  if self.failedJobs:
@@ -906,6 +962,8 @@ class experiment:
906
962
  return self.workspace.connector.createtoken(name, count)
907
963
 
908
964
  def __enter__(self):
965
+ from .dynamic_outputs import TaskOutputsWorker
966
+
909
967
  if self.workspace.run_mode != RunMode.DRY_RUN:
910
968
  logger.info("Locking experiment %s", self.xplockpath)
911
969
  self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
@@ -934,6 +992,7 @@ class experiment:
934
992
  global SIGNAL_HANDLER
935
993
  # Number of unfinished jobs
936
994
  self.unfinishedJobs = 0
995
+ self.taskOutputQueueSize = 0
937
996
 
938
997
  # List of failed jobs
939
998
  self.failedJobs: Dict[str, Job] = {}
@@ -942,6 +1001,8 @@ class experiment:
942
1001
  self.exitMode = False
943
1002
 
944
1003
  self.central = SchedulerCentral.create(self.scheduler.name)
1004
+ self.taskOutputsWorker = TaskOutputsWorker(self)
1005
+ self.taskOutputsWorker.start()
945
1006
 
946
1007
  SIGNAL_HANDLER.add(self)
947
1008
 
@@ -975,8 +1036,13 @@ class experiment:
975
1036
  service.stop()
976
1037
 
977
1038
  if self.central is not None:
1039
+ logger.info("Stopping scheduler event loop")
978
1040
  self.central.loop.stop()
979
1041
 
1042
+ if self.taskOutputsWorker is not None:
1043
+ logger.info("Stopping tasks outputs worker")
1044
+ self.taskOutputsWorker.queue.put(None)
1045
+
980
1046
  self.central = None
981
1047
  self.workspace.__exit__(exc_type, exc_value, traceback)
982
1048
  if self.xplock:
@@ -987,6 +1053,24 @@ class experiment:
987
1053
  if self.server:
988
1054
  self.server.stop()
989
1055
 
1056
+ async def update_task_output_count(self, delta: int):
1057
+ """Change in the number of task outputs to process"""
1058
+ async with self.central.exitCondition:
1059
+ self.taskOutputQueueSize += delta
1060
+ logging.debug(
1061
+ "Updating queue size with %d => %d", delta, self.taskOutputQueueSize
1062
+ )
1063
+ if self.taskOutputQueueSize == 0:
1064
+ self.central.exitCondition.notify_all()
1065
+
1066
+ def watch_output(self, watched: "WatchedOutput"):
1067
+ """Watch an output
1068
+
1069
+ :param watched: The watched output specification
1070
+ """
1071
+
1072
+ self.taskOutputsWorker.watch_output(watched)
1073
+
990
1074
  def add_service(self, service: ServiceClass) -> ServiceClass:
991
1075
  """Adds a service (e.g. tensorboard viewer) to the experiment
992
1076
 
@@ -0,0 +1,184 @@
1
+ """Handles dynamic task outputs"""
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import queue
7
+ import threading
8
+ from collections import defaultdict
9
+ from functools import cached_property
10
+ from pathlib import Path
11
+ from typing import Callable, TYPE_CHECKING
12
+
13
+ from watchdog.events import FileSystemEventHandler
14
+
15
+ from experimaestro.ipc import ipcom
16
+ from experimaestro.utils import logger
17
+
18
+ from .base import Job, experiment
19
+
20
+ if TYPE_CHECKING:
21
+ from experimaestro.core.objects import WatchedOutput
22
+
23
+
24
+ class TaskOutputCallbackHandler:
25
+ def __init__(self, converter: Callable):
26
+ pass
27
+
28
+
29
+ class TaskOutputs(FileSystemEventHandler):
30
+ """Represent and monitors dynamic outputs generated by one task"""
31
+
32
+ #: Global dictionary for handles
33
+ HANDLERS: dict[Path, "TaskOutputs"] = {}
34
+
35
+ #: Global lock to access current HANDLERS
36
+ LOCK = threading.Lock()
37
+
38
+ def create(job: Job):
39
+ with TaskOutputs.LOCK:
40
+ if instance := TaskOutputs.get(job.task_outputs_path, None):
41
+ return instance
42
+
43
+ instance = TaskOutputs(job.task_outputs_path)
44
+ TaskOutputs[job.task_outputs_path] = instance
45
+ return instance
46
+
47
+ def __init__(self, path: Path):
48
+ """Monitors an event path"""
49
+ logger.debug("Watching dynamic task outputs in %s", path)
50
+ self.path = path
51
+ self.handle = None
52
+ self.count = 0
53
+ self.lock = threading.Lock()
54
+ self.listeners: dict[str, dict[Callable, set[Callable]]] = defaultdict(
55
+ lambda: defaultdict(set)
56
+ )
57
+
58
+ #: The events registered so far
59
+ self.events = []
60
+
61
+ def __enter__(self):
62
+ """Starts monitoring task outputs"""
63
+ self.job.task_outputs_path.parent.mkdir(parents=True, exist_ok=True)
64
+ with self.lock:
65
+ if self.handle is None:
66
+ assert self.count == 0
67
+ self.handle = ipcom().fswatch(self, self.path.parent, False)
68
+ self.count += 1
69
+ return self
70
+
71
+ def __exit__(self, *args):
72
+ """Stops monitoring task outputs"""
73
+ with self.lock:
74
+ self.count -= 1
75
+ if self.count == 0:
76
+ ipcom().fsunwatch(self.handle)
77
+ self.fh.close()
78
+
79
+ self.handle = None
80
+ self._fh = None
81
+
82
+ def watch_output(self, watched: "WatchedOutput"):
83
+ """Add a new listener"""
84
+ key = f"{watched.config.__identifier__}/{watched.method_name}"
85
+ with self.lock:
86
+ # Process events so far
87
+ listener = self.listeners[key].get(watched.method, None)
88
+ if listener is None:
89
+ listener = TaskOutputCallbackHandler(watched.method)
90
+
91
+ # Register
92
+ self.listeners[key][watched.method].add(watched.callback)
93
+
94
+ #
95
+ # --- Events
96
+ #
97
+
98
+ @cached_property
99
+ def fh(self):
100
+ if self._fh is None:
101
+ self._fh = self.path.open("rt")
102
+ return self._fh
103
+
104
+ def on_modified(self, event):
105
+ self.handle(Path(event.src_path))
106
+
107
+ def on_created(self, event):
108
+ self.handle(Path(event.src_path))
109
+
110
+ def handle(self, path: Path):
111
+ if path != self.path:
112
+ return
113
+
114
+ with self.lock:
115
+ logger.debug("[TASK OUTPUT] Handling task output for %s", self.path)
116
+
117
+ while json_line := self.fh.readline():
118
+ # Read the event
119
+ event = json.loads(json_line)
120
+ logger.debug("Event: %s", event)
121
+
122
+ # FIXME: move elsewhere
123
+ # # Process the event
124
+ # event = self.config_method(
125
+ # self.job.config.__xpm__.mark_output,
126
+ # *event["args"],
127
+ # **event["kwargs"],
128
+ # )
129
+
130
+ self.events.append(event)
131
+ # self.job.scheduler.xp.taskOutputsWorker.add(self, event)
132
+
133
+
134
+ class TaskOutputsWorker(threading.Thread):
135
+ """This worker process dynamic output queue for one experiment"""
136
+
137
+ def __init__(self, xp: experiment):
138
+ super().__init__(name="task outputs worker")
139
+ self.queue = queue.Queue()
140
+ self.xp = xp
141
+
142
+ def watch_output(self, watched: "WatchedOutput"):
143
+ """Watch an output
144
+
145
+ :param watched: The watched output specification
146
+ """
147
+ logger.debug("Registering task output listener %s", watched)
148
+
149
+ # path = watched.job.tasks_output_path
150
+ TaskOutputs.create(watched.job).watch_output(watched)
151
+
152
+ def add(self, watcher, event):
153
+ asyncio.run_coroutine_threadsafe(
154
+ self.xp.update_task_output_count(1),
155
+ self.xp.scheduler.loop,
156
+ ).result()
157
+ self.queue.put((watcher, event))
158
+
159
+ def run(self):
160
+ logging.debug("Starting output listener queue")
161
+ while True:
162
+ # Get the next element in the queue
163
+ element = self.queue.get()
164
+ if element is None:
165
+ # end of processing
166
+ break
167
+
168
+ # Call all the listeners
169
+ logging.debug("Got one event: %s", element)
170
+ watcher, event = element
171
+ for listener in watcher.listeners:
172
+ try:
173
+ logger.debug("Calling listener [%s] with %s", listener, event)
174
+ listener(event)
175
+ logger.debug(
176
+ "[done] Calling listener [%s] with %s", listener, event
177
+ )
178
+ except Exception:
179
+ logging.exception("Exception while calling the listener")
180
+ self.queue.task_done()
181
+
182
+ asyncio.run_coroutine_threadsafe(
183
+ self.xp.update_task_output_count(-1), self.xp.scheduler.loop
184
+ ).result()
Binary file