experimaestro 2.0.0a3__py3-none-any.whl → 2.0.0a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

@@ -0,0 +1,475 @@
1
+ import asyncio
2
+ import time
3
+ from collections import ChainMap
4
+ import enum
5
+ from functools import cached_property
6
+ import itertools
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Iterator, List, Optional, Set
9
+
10
+ import concurrent
11
+
12
+ from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
13
+ from experimaestro.notifications import LevelInformation, Reporter
14
+
15
+ # from experimaestro.scheduler.base import Scheduler
16
+ from experimaestro.scheduler.dependencies import Dependency, DependencyStatus, Resource
17
+ from experimaestro.scheduler.workspace import RunMode, Workspace
18
+ from experimaestro.locking import Lock, LockError, Locks
19
+ from experimaestro.utils import logger
20
+
21
+ if TYPE_CHECKING:
22
+ from experimaestro.connectors import Process
23
+ from experimaestro.launchers import Launcher
24
+
25
+
26
class JobState(enum.Enum):
    """Life-cycle states of a job, ordered by increasing value"""

    # Job is not yet scheduled
    UNSCHEDULED = 0

    # Job is waiting for dependencies to be done
    WAITING = 1

    # Job is ready to run
    READY = 2

    # Job is scheduled (e.g. slurm)
    SCHEDULED = 3

    # Job is running
    RUNNING = 4

    # Job is done (finished)
    DONE = 5

    # Job failed (finished)
    ERROR = 6

    def notstarted(self):
        """Returns True for any state up to (and including) READY"""
        return JobState.READY.value >= self.value

    def running(self):
        """Returns True when the job is either SCHEDULED or RUNNING"""
        active_values = (JobState.RUNNING.value, JobState.SCHEDULED.value)
        return self.value in active_values

    def finished(self):
        """Returns True for DONE and any state that follows it (ERROR)"""
        return JobState.DONE.value <= self.value
59
+
60
+
61
class JobFailureStatus(enum.Enum):
    """Reason attached to a job that ended in error"""

    #: Job dependency failed
    DEPENDENCY = 0

    #: Job failed
    FAILED = 1

    #: Failure related to memory (presumably out-of-memory -- TODO confirm)
    MEMORY = 2
70
+
71
+
72
class JobLock(Lock):
    """Lock bound to a job, whose acquisition depends on the job being done"""

    def __init__(self, job):
        """
        :param job: The job this lock refers to
        """
        super().__init__()
        self.job = job

    def _acquire(self):
        """Returns True only if the job state is DONE"""
        current_state = self.job.state
        return current_state == JobState.DONE

    def _release(self):
        """Nothing to release: always returns False"""
        return False
82
+
83
+
84
class JobDependency(Dependency):
    """Dependency on another job (the job is passed as the dependency origin)"""

    def __init__(self, job):
        """
        :param job: The job that is depended upon
        """
        super().__init__(job)

    def status(self) -> DependencyStatus:
        """Maps the state of the origin job to a dependency status

        :return: OK when the origin is DONE, FAIL when it is in ERROR, and
            WAIT in every other case
        """
        origin_state = self.origin.state
        if origin_state == JobState.DONE:
            return DependencyStatus.OK
        if origin_state == JobState.ERROR:
            return DependencyStatus.FAIL
        return DependencyStatus.WAIT

    def lock(self):
        """Returns a new :class:`JobLock` on the origin job"""
        origin_job = self.origin
        return JobLock(origin_job)
97
+
98
+
99
class Job(Resource):
    """A job is a resource that is produced by the execution of some code"""

    # Set by the scheduler
    _readyEvent: Optional[asyncio.Event]
    _future: Optional["concurrent.futures.Future"]

    def __init__(
        self,
        config: Config,
        *,
        workspace: Workspace = None,
        launcher: "Launcher" = None,
        run_mode: RunMode = RunMode.NORMAL,
    ):
        """Creates a job from a configuration

        :param config: The configuration that describes the job
        :param workspace: The workspace (defaults to ``Workspace.CURRENT``)
        :param launcher: The launcher (defaults to the workspace launcher, if
            a workspace is available)
        :param run_mode: In ``RunMode.NORMAL``, both a workspace and a launcher
            must be available (this is asserted)
        """
        # Kept from the original code; only used by the annotation below
        from experimaestro.scheduler.base import Scheduler  # noqa: F401

        super().__init__()

        self.workspace = workspace or Workspace.CURRENT

        # An explicitly given launcher always wins; the workspace launcher is
        # only a fallback. Without the parentheses, the conditional expression
        # would bind last and discard `launcher` when there is no workspace.
        self.launcher = launcher or (
            self.workspace.launcher if self.workspace else None
        )

        if run_mode == RunMode.NORMAL:
            assert self.workspace is not None, "No experiment has been defined"
            assert self.launcher is not None, (
                "No launcher, and no default defined for the workspace %s"
                % self.workspace
            )

        self.type = config.__xpmtype__
        self.name = str(self.type.identifier).rsplit(".", 1)[-1]

        self.scheduler: Optional["Scheduler"] = None
        self.config = config
        self.state: JobState = JobState.UNSCHEDULED

        #: If a job has failed, indicates the failure status
        self.failure_status: JobFailureStatus = None

        # Dependencies
        self.dependencies: Set[Dependency] = set()  # as target

        # Watched outputs
        # NOTE(review): `self.scheduler` is still None here, so `watch_output`
        # (as defined in this class) fails with an AttributeError as soon as
        # the configuration declares a watched output -- confirm the intent.
        self.watched_outputs = {}
        for watched in config.__xpm__.watched_outputs:
            self.watch_output(watched)

        # Process
        self._process = None
        self.unsatisfied = 0

        # Meta-information
        self.starttime: Optional[float] = None
        self.submittime: Optional[float] = None
        self.endtime: Optional[float] = None
        self._progress: List[LevelInformation] = []
        self.tags = config.tags()

    def watch_output(self, watched: "WatchedOutput"):
        """Monitor task outputs

        :param watched: A description of the watched output
        """
        self.scheduler.xp.watch_output(watched)

    def task_output_update(self, subpath: Path):
        """Notification of an updated task output

        :param subpath: Key used to look up the watcher in `watched_outputs`
        """
        if watcher := self.watched_outputs.get(subpath, None):
            watcher.update()

    def done_handler(self):
        """The task has been completed: updates every registered watcher"""
        for watcher in self.watched_outputs.values():
            watcher.update()

    def __str__(self):
        return "Job[{}]".format(self.identifier)

    def wait(self) -> JobState:
        """Blocks until the job future has a result, and returns it"""
        assert self._future, "Cannot wait a not submitted job"
        return self._future.result()

    @property
    def python_path(self) -> Iterator[str]:
        """Returns an iterator over python path

        A fresh iterator is built at each access: caching the iterator itself
        would hand out an exhausted iterator after its first use.
        """
        return itertools.chain(self.workspace.python_path)

    @cached_property
    def environ(self):
        """Returns the job environment

        It is made of (by order of priority):

        1. The job environment
        2. The launcher environment
        3. The workspace environment

        """
        return ChainMap(
            {},
            self.launcher.environ if self.launcher else {},
            self.workspace.env if self.workspace else {},
        )

    @property
    def progress(self):
        """The list of progress information (one entry per level)"""
        return self._progress

    def set_progress(self, level: int, value: float, desc: Optional[str]):
        """Sets the progress for a given level and notifies the listeners

        :param level: The progress level; deeper levels are discarded and
            missing levels are created
        :param value: The progress, clamped to [0, 1] (with a warning)
        :param desc: The description of the level (ignored if empty or None)
        """
        if value < 0:
            logger.warning(f"Progress value out of bounds ({value})")
            value = 0
        elif value > 1:
            logger.warning(f"Progress value out of bounds ({value})")
            value = 1

        # Adjust the length of the array
        self._progress = self._progress[: (level + 1)]
        while len(self._progress) <= level:
            self._progress.append(LevelInformation(len(self._progress), None, 0.0))

        # The last entry now corresponds to `level`
        if desc:
            self._progress[-1].desc = desc
        self._progress[-1].progress = value

        for listener in self.scheduler.listeners:
            listener.job_state(self)

    def add_notification_server(self, server):
        """Adds a notification server

        Writes the notification URL (base URL followed by the job identifier)
        in a file named after the server key, within the notification folder
        of the job directory.

        :param server: Object providing `getNotificationSpec()`, which returns
            a `(key, baseurl)` pair
        """
        key, baseurl = server.getNotificationSpec()
        dirpath = self.path / Reporter.NOTIFICATION_FOLDER
        dirpath.mkdir(exist_ok=True)
        (dirpath / key).write_text(f"{baseurl}/{self.identifier}")

    @property
    def ready(self):
        """True if the job state is READY"""
        return self.state == JobState.READY

    @property
    def jobpath(self) -> Path:
        """Deprecated, use `path`"""
        return self.workspace.jobspath / self.relpath

    @property
    def path(self) -> Path:
        """The job directory (within the workspace jobs folder)"""
        return self.workspace.jobspath / self.relpath

    @property
    def experimaestro_path(self) -> Path:
        """The resolved `.experimaestro` folder of the job directory"""
        return (self.path / ".experimaestro").resolve()

    @cached_property
    def task_outputs_path(self) -> Path:
        """The `task-outputs.jsonl` file within the `.experimaestro` folder"""
        return self.experimaestro_path / "task-outputs.jsonl"

    @property
    def relpath(self):
        """Relative path: type identifier / hex of the full (`all`) identifier"""
        identifier = self.config.__xpm__.identifier
        base = Path(str(self.type.identifier))
        return base / identifier.all.hex()

    @property
    def relmainpath(self):
        """Relative path: type identifier / hex of the `main` identifier"""
        identifier = self.config.__xpm__.identifier
        base = Path(str(self.type.identifier))
        return base / identifier.main.hex()

    @property
    def hashidentifier(self):
        """The identifier object of the configuration"""
        return self.config.__xpm__.identifier

    @property
    def identifier(self):
        """The hexadecimal form of the full (`all`) identifier"""
        return self.config.__xpm__.identifier.all.hex()

    def prepare(self, overwrite=False):
        """Prepare all files before starting a task

        :param overwrite: if True, overwrite files even if the task has been run
        """
        pass

    async def aio_start(self, sched_dependency_lock, notification_server=None):
        """Start the job with core job starting logic

        This method contains the core logic for starting a job that was previously
        located in Scheduler.aio_start(). It handles job locking, dependency
        acquisition, directory setup, and job execution while using the scheduler's
        coordination lock to prevent race conditions between multiple jobs.

        :param sched_dependency_lock: The scheduler's dependency lock for coordination
            between jobs to prevent race conditions during dependency acquisition
        :param notification_server: Optional notification server from the experiment
            for job progress reporting
        :return: JobState.DONE if job completed successfully, JobState.ERROR if job
            failed during execution, or None if dependencies couldn't be locked
            (signals WAITING state to scheduler)
        :raises Exception: Various exceptions during job execution, dependency locking,
            or process creation
        """
        # We first lock the job before proceeding
        assert self.launcher is not None

        with Locks() as locks:
            logger.debug("[starting] Locking job %s", self)
            async with self.launcher.connector.lock(self.lockpath):
                logger.debug("[starting] Locked job %s", self)

                state = None
                try:
                    logger.debug(
                        "Starting job %s with %d dependencies",
                        self,
                        len(self.dependencies),
                    )

                    # Individual dependency lock acquisition
                    # We use the scheduler-wide lock to avoid cross-jobs race
                    # conditions
                    async with sched_dependency_lock:
                        for dependency in self.dependencies:
                            try:
                                locks.append(dependency.lock().acquire())
                            except LockError:
                                logger.warning(
                                    "Could not lock %s, aborting start for job %s",
                                    dependency,
                                    self,
                                )
                                dependency.check()
                                # Signal to scheduler that dependencies
                                # couldn't be locked
                                return None

                    # Dependencies have been locked, we can start the job
                    self.starttime = time.time()

                    # Creates the main directory
                    directory = self.path
                    logger.debug("Making directories job %s...", directory)
                    if not directory.is_dir():
                        directory.mkdir(parents=True, exist_ok=True)

                    # Sets up the notification URL
                    if notification_server is not None:
                        self.add_notification_server(notification_server)

                except Exception:
                    logger.warning("Error while locking job", exc_info=True)
                    return None  # Signal waiting state to scheduler

                try:
                    # Runs the job
                    process = await self.aio_run()
                except Exception:
                    logger.warning("Error while starting job", exc_info=True)
                    return JobState.ERROR

            # NOTE(review): the end of the process is awaited outside of the
            # job-file lock but inside `Locks()` -- confirm this nesting
            # against the upstream layout.
            try:
                if isinstance(process, JobState):
                    # `aio_run` directly gave the final state
                    state = process
                    logger.debug("Job %s ended (state %s)", self, state)
                else:
                    logger.debug("Waiting for job %s process to end", self)

                    code = await process.aio_code()
                    logger.debug("Got return code %s for %s", code, self)

                    # Check the file if there is no return code
                    if code is None:
                        # Case where we cannot retrieve the code right away
                        if self.donepath.is_file():
                            code = 0
                        else:
                            code = int(self.failedpath.read_text())

                    logger.debug("Job %s ended with code %s", self, code)
                    state = JobState.DONE if code == 0 else JobState.ERROR

            except JobError:
                logger.warning("Error while running job")
                state = JobState.ERROR

            except Exception:
                logger.warning(
                    "Error while running job (in experimaestro)", exc_info=True
                )
                state = JobState.ERROR
        return state

    async def aio_run(self):
        """Actually run the code"""
        raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")

    async def aio_process(self) -> Optional["Process"]:
        """Returns the process if it exists"""
        raise NotImplementedError("Not implemented")

    @property
    def pidpath(self):
        """This file contains the file PID"""
        return self.jobpath / ("%s.pid" % self.name)

    @property
    def lockpath(self):
        """This file is used as a lock for running the job"""
        return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)

    @property
    def donepath(self) -> Path:
        """When a job has been successful, this file is written"""
        return self.jobpath / ("%s.done" % self.name)

    @property
    def failedpath(self):
        """When a job has been unsuccessful, this file is written with an error
        code inside"""
        return self.jobpath / ("%s.failed" % self.name)

    @property
    def stdout(self) -> Path:
        """Path of the `.out` file of the job"""
        return self.jobpath / ("%s.out" % self.name)

    @property
    def stderr(self) -> Path:
        """Path of the `.err` file of the job"""
        return self.jobpath / ("%s.err" % self.name)

    @property
    def basepath(self) -> Path:
        """Job directory joined with the job name"""
        return self.jobpath / self.name

    def dependencychanged(self, dependency, oldstatus, status):
        """Called when a dependency has changed

        :param dependency: The dependency that changed
        :param oldstatus: The previous status of the dependency
        :param status: The new status of the dependency
        """

        def value(s):
            return 1 if s == DependencyStatus.OK else 0

        # One less unsatisfied dependency when moving to OK, one more when
        # leaving OK
        self.unsatisfied -= value(status) - value(oldstatus)

        logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)

        if status == DependencyStatus.FAIL:
            # Job completed
            if not self.state.finished():
                self.state = JobState.ERROR
                self.failure_status = JobFailureStatus.DEPENDENCY
                self._readyEvent.set()

        if self.unsatisfied == 0:
            logger.info("Job %s is ready to run", self)
            # We are ready
            self.state = JobState.READY
            self._readyEvent.set()

    def finalState(self) -> "concurrent.futures.Future[JobState]":
        """Returns the future of the job (which must have been set)"""
        assert self._future is not None
        return self._future
453
+
454
+
455
class JobContext(ConfigWalkContext):
    """Configuration walk context bound to a job"""

    def __init__(self, job: Job):
        """
        :param job: The job whose name, path and configuration are exposed
        """
        super().__init__()
        self.job = job

    @property
    def name(self):
        """The name of the underlying job"""
        job = self.job
        return job.name

    @property
    def path(self):
        """The path of the underlying job"""
        job = self.job
        return job.path

    @property
    def task(self):
        """The configuration of the underlying job"""
        job = self.job
        return job.config
471
+
472
+
473
class JobError(Exception):
    """Raised when a job exits with an error code"""

    def __init__(self, code):
        """
        :param code: The exit code of the job
        """
        super().__init__(f"Job exited with code {code}")
        #: The exit code, kept so that handlers do not have to parse the message
        self.code = code
@@ -0,0 +1,32 @@
1
+ import signal
2
+ from typing import Set
3
+ from experimaestro.scheduler import experiment
4
+ from experimaestro.utils import logger
5
+
6
+
7
class SignalHandler:
    """SIGINT handler that stops every registered experiment

    The handler is installed when the first experiment is added, and the
    previous SIGINT handler is restored when the last one is removed.
    """

    def __init__(self):
        self.experiments: Set["experiment"] = set()
        self.original_sigint_handler = None

    def add(self, xp: "experiment"):
        """Registers an experiment

        :param xp: The experiment to stop when SIGINT is received
        """
        if not self.experiments:
            # First experiment: remember the current handler and take over
            self.original_sigint_handler = signal.getsignal(signal.SIGINT)

            signal.signal(signal.SIGINT, self)

        self.experiments.add(xp)

    def remove(self, xp):
        """Unregisters an experiment

        :param xp: A previously added experiment
        :raises KeyError: if the experiment was not registered
        """
        self.experiments.remove(xp)
        if not self.experiments:
            # Last experiment: give SIGINT back to the previous handler
            signal.signal(signal.SIGINT, self.original_sigint_handler)

    def __call__(self, signum, frame):
        """SIGINT signal handler"""
        logger.warning("Signal received")
        # Iterate over a snapshot: if stopping an experiment ends up calling
        # `remove`, the set would otherwise change size during iteration
        for xp in tuple(self.experiments):
            xp.stop()


SIGNAL_HANDLER = SignalHandler()
@@ -5,7 +5,7 @@ from typing import Iterable, Optional, Type
5
5
  from experimaestro import Task
6
6
 
7
7
  from experimaestro.core.context import SerializationContext
8
- from experimaestro.scheduler.base import Job, JobDependency
8
+ from experimaestro.scheduler.jobs import Job, JobDependency
9
9
  from experimaestro.settings import find_workspace
10
10
  from experimaestro.core.serialization import from_state_dict, save_definition
11
11
 
@@ -5,10 +5,25 @@ import platform
5
5
  import socket
6
6
  import uuid
7
7
  from experimaestro.scheduler.base import Job
8
- import pkg_resources
8
+ import sys
9
9
  import http
10
10
  import threading
11
11
  from typing import Optional, Tuple
12
+
13
+ if sys.version_info >= (3, 9):
14
+ from importlib.resources import files
15
+
16
+ pkg_resources = None
17
+ else:
18
+ try:
19
+ from importlib_resources import files
20
+
21
+ pkg_resources = None
22
+ except ImportError:
23
+ # Fallback to pkg_resources if importlib_resources not available
24
+ import pkg_resources
25
+
26
+ files = None
12
27
  from experimaestro.scheduler import Scheduler, Listener as BaseListener
13
28
  from experimaestro.scheduler.services import Service, ServiceListener
14
29
  from experimaestro.settings import ServerSettings
@@ -143,6 +158,7 @@ def proxy_response(base_url: str, request: Request, path: str):
143
158
  return flask_response
144
159
 
145
160
 
161
+ # flake8: noqa: C901
146
162
  def start_app(server: "Server"):
147
163
  logging.debug("Starting Flask server...")
148
164
  app = Flask("experimaestro")
@@ -256,10 +272,25 @@ def start_app(server: "Server"):
256
272
 
257
273
  datapath = "data/%s" % path
258
274
  logging.debug("Looking for %s", datapath)
259
- if pkg_resources.resource_exists("experimaestro.server", datapath):
260
- mimetype = MIMETYPES[datapath.rsplit(".", 1)[1]]
261
- content = pkg_resources.resource_string("experimaestro.server", datapath)
262
- return Response(content, mimetype=mimetype)
275
+
276
+ if files is not None:
277
+ try:
278
+ package_files = files("experimaestro.server")
279
+ resource_file = package_files / datapath
280
+ if resource_file.is_file():
281
+ mimetype = MIMETYPES[datapath.rsplit(".", 1)[1]]
282
+ content = resource_file.read_bytes()
283
+ return Response(content, mimetype=mimetype)
284
+ except (FileNotFoundError, KeyError):
285
+ pass
286
+ elif pkg_resources is not None:
287
+ # Fallback to pkg_resources
288
+ if pkg_resources.resource_exists("experimaestro.server", datapath):
289
+ mimetype = MIMETYPES[datapath.rsplit(".", 1)[1]]
290
+ content = pkg_resources.resource_string(
291
+ "experimaestro.server", datapath
292
+ )
293
+ return Response(content, mimetype=mimetype)
263
294
  return Response("Page not found", status=404)
264
295
 
265
296
  # Start the app
@@ -1,7 +1,7 @@
1
1
  from typing import Any, Callable
2
2
  import pytest
3
3
  from experimaestro import Config, Param, Task, RunMode
4
- from experimaestro.scheduler.base import JobDependency
4
+ from experimaestro.scheduler.jobs import JobDependency
5
5
  from experimaestro.tests.utils import TemporaryExperiment
6
6
 
7
7
 
@@ -13,12 +13,35 @@ class Learner(Task):
13
13
  validation: Param[Validation]
14
14
  x: Param[int]
15
15
 
16
+ @staticmethod
17
+ def create(x: int, validation: Param[Validation]):
18
+ return Learner.C(x=x, validation=validation)
19
+
20
+
21
+ class LearnerList(Task):
22
+ validation: Param[list[Validation]]
23
+ x: Param[int]
24
+
25
+ @staticmethod
26
+ def create(x: int, validation: Param[Validation]):
27
+ return LearnerList.C(x=x, validation=[validation])
28
+
29
+
30
+ class LearnerDict(Task):
31
+ validation: Param[dict[str, Validation]]
32
+ x: Param[int]
33
+
34
+ @staticmethod
35
+ def create(x: int, validation: Param[Validation]):
36
+ return LearnerDict.C(x=x, validation={"key": validation})
37
+
16
38
 
17
39
  class ModuleLoader(Task):
18
40
  validation: Param[Validation] = field(ignore_generated=True)
19
41
 
20
42
 
21
- def test_generators_reuse_on_submit():
43
+ @pytest.mark.parametrize("cls", [Learner, LearnerDict, LearnerList])
44
+ def test_generators_reuse_on_submit(cls):
22
45
  # We have one way to select the best model
23
46
  validation = Validation.C()
24
47
 
@@ -29,40 +52,42 @@ def test_generators_reuse_on_submit():
29
52
  )
30
53
 
31
54
  # OK, the path is generated depending on Learner with x=1
32
- Learner.C(x=1, validation=validation).submit(workspace=workspace)
55
+ cls.create(1, validation).submit(workspace=workspace)
33
56
 
34
57
  with pytest.raises((AttributeError)):
35
58
  # Here we have a problem...
36
59
  # the path is still the previous one
37
- Learner.C(x=2, validation=validation).submit(workspace=workspace)
60
+ cls.create(2, validation).submit(workspace=workspace)
38
61
 
39
62
 
40
- def test_generators_delayed_submit():
63
+ @pytest.mark.parametrize("cls", [Learner, LearnerDict, LearnerList])
64
+ def test_generators_delayed_submit(cls):
41
65
  workspace = Workspace(
42
66
  Settings(),
43
67
  WorkspaceSettings("test_generators_simple", path=Path("/tmp")),
44
68
  run_mode=RunMode.DRY_RUN,
45
69
  )
46
70
  validation = Validation.C()
47
- task1 = Learner.C(x=1, validation=validation)
48
- task2 = Learner.C(x=2, validation=validation)
71
+ task1 = cls.create(1, validation)
72
+ task2 = cls.create(2, validation)
49
73
  task1.submit(workspace=workspace)
50
74
  with pytest.raises((AttributeError)):
51
75
  task2.submit(workspace=workspace)
52
76
 
53
77
 
54
- def test_generators_reuse_on_set():
78
+ @pytest.mark.parametrize("cls", [Learner, LearnerDict, LearnerList])
79
+ def test_generators_reuse_on_set(cls):
55
80
  workspace = Workspace(
56
81
  Settings(),
57
82
  WorkspaceSettings("test_generators_simple", path=Path("/tmp")),
58
83
  run_mode=RunMode.DRY_RUN,
59
84
  )
60
85
  validation = Validation.C()
61
- Learner.C(x=1, validation=validation).submit(workspace=workspace)
86
+ cls.create(1, validation).submit(workspace=workspace)
62
87
  with pytest.raises((AttributeError)):
63
88
  # We should not be able to *create* a second task with the same validation,
64
89
  # even without submitting it
65
- Learner.C(x=2, validation=validation)
90
+ cls.create(2, validation)
66
91
 
67
92
  # This should run OK
68
93
  ModuleLoader.C(validation=validation)