experimaestro 2.0.0a3__py3-none-any.whl → 2.0.0a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic. See the package's registry page for details about this version of experimaestro.

@@ -0,0 +1,387 @@
1
+ import asyncio
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+ from shutil import rmtree
6
+ from typing import Any, Dict, Optional, TypeVar, Union
7
+
8
+ from experimaestro.core.objects import WatchedOutput
9
+ from experimaestro.exceptions import HandledException
10
+
11
+ from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
12
+ from experimaestro.scheduler.jobs import Job, JobFailureStatus
13
+ from experimaestro.scheduler.services import Service
14
+ from experimaestro.scheduler.workspace import RunMode, Workspace
15
+ from experimaestro.settings import WorkspaceSettings, get_settings
16
+ from experimaestro.utils import logger
17
+
18
+ ServiceClass = TypeVar("ServiceClass", bound=Service)
19
+
20
+
21
class FailedExperiment(HandledException):
    """Raised when an experiment finishes with failed jobs."""
25
+
26
+
27
class experiment:
    """Main experiment object

    It is a context object, i.e. an experiment is run with

    ```py
    with experiment(...) as xp:
        ...
    ```
    """

    #: Current experiment (the innermost active `with experiment(...)` block),
    #: or None when no experiment context is active
    CURRENT: Optional["experiment"] = None
40
+
41
+ @staticmethod
42
+ def current() -> "experiment":
43
+ """Returns the current experiment, but checking first if set
44
+
45
+ If there is no current experiment, raises an AssertError
46
+ """
47
+ assert experiment.CURRENT is not None, "No current experiment defined"
48
+ return experiment.CURRENT
49
+
50
    def __init__(
        self,
        env: Union[Path, str, WorkspaceSettings],
        name: str,
        *,
        host: Optional[str] = None,
        port: Optional[int] = None,
        token: Optional[str] = None,
        run_mode: Optional[RunMode] = None,
        launcher=None,
    ):
        """
        :param env: an environment -- or a working directory for a local
            environment

        :param name: the identifier of the experiment

        :param launcher: The launcher (if not provided, inferred from path)

        :param host: The host for the web server (overrides the environment if
            set)
        :param port: the port for the web server (overrides the environment if
            set). Use negative number to avoid running a web server (default when dry run).

        :param run_mode: The run mode for the experiment (normal, generate run
            files, dry run)
        """

        # Local imports to avoid import cycles with the server/scheduler modules
        from experimaestro.server import Server
        from experimaestro.scheduler import Listener, Scheduler

        settings = get_settings()
        if not isinstance(env, WorkspaceSettings):
            # A bare path/string denotes a local workspace
            env = WorkspaceSettings(id=None, path=Path(env))

        # Creates the workspace
        run_mode = run_mode or RunMode.NORMAL
        self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)

        # Mark the directory as an experimaestro folder
        self.workdir = self.workspace.experimentspath / name
        self.workdir.mkdir(parents=True, exist_ok=True)
        self.xplockpath = self.workdir / "lock"
        self.xplock = None
        self.old_experiment = None
        self.services: Dict[str, Service] = {}
        self._job_listener: Optional[Listener] = None

        # Override configuration settings with explicit arguments (if given)

        if host is not None:
            settings.server.host = host

        if port is not None:
            settings.server.port = port

        if token is not None:
            settings.server.token = token

        # Create the scheduler; the web server is only created for a normal
        # run with a configured, non-negative port
        self.scheduler = Scheduler.create(self, name)
        self.server = (
            Server(self.scheduler, settings.server)
            if (settings.server.port is not None and settings.server.port >= 0)
            and self.workspace.run_mode == RunMode.NORMAL
            else None
        )

        # Opt-in crash diagnostics via the XPM_ENABLEFAULTHANDLER env variable
        if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
            import faulthandler

            logger.info("Enabling fault handler")
            faulthandler.enable(all_threads=True)
123
+
124
+ def submit(self, job: Job):
125
+ return self.scheduler.submit(job)
126
+
127
+ def prepare(self, job: Job):
128
+ """Generate the file"""
129
+ return self.scheduler.prepare(job)
130
+
131
+ @property
132
+ def run_mode(self):
133
+ return self.workspace.run_mode
134
+
135
+ @property
136
+ def loop(self):
137
+ assert self.scheduler is not None, "No scheduler defined"
138
+ return self.scheduler.loop
139
+
140
+ @property
141
+ def resultspath(self):
142
+ """Return the directory in which results can be stored for this experiment"""
143
+ return self.workdir / "results"
144
+
145
+ @property
146
+ def jobspath(self):
147
+ """Return the directory in which results can be stored for this experiment"""
148
+ return self.workdir / "jobs"
149
+
150
+ @property
151
+ def alt_jobspaths(self):
152
+ """Return potential other directories"""
153
+ for alt_workdir in self.workspace.alt_workdirs:
154
+ yield alt_workdir / "jobs"
155
+
156
+ @property
157
+ def jobsbakpath(self):
158
+ """Return the directory in which results can be stored for this experiment"""
159
+ return self.workdir / "jobs.bak"
160
+
161
    def stop(self):
        """Stop the experiment as soon as possible"""

        async def doStop():
            # Runs on the scheduler's event loop: set the exit flag and wake
            # up any coroutine waiting on the exit condition (see wait())
            assert self.scheduler is not None
            async with self.scheduler.exitCondition:
                self.exitMode = True
                logging.debug("Setting exit mode to true")
                self.scheduler.exitCondition.notify_all()

        # stop() may be called from another thread (e.g. a signal handler),
        # so the coroutine is scheduled thread-safely on the scheduler loop
        assert self.scheduler is not None and self.scheduler.loop is not None
        asyncio.run_coroutine_threadsafe(doStop(), self.scheduler.loop)
173
+
174
    def wait(self):
        """Wait until the running processes have finished

        Blocks the calling thread until all jobs and task outputs are
        processed (or an explicit stop was requested), then raises
        :class:`FailedExperiment` if any job failed on its own.
        """

        async def awaitcompletion():
            # Runs on the scheduler's event loop
            assert self.scheduler is not None, "No scheduler defined"
            logger.debug("Waiting to exit scheduler...")
            async with self.scheduler.exitCondition:
                while True:
                    # Explicit stop request (see stop())
                    if self.exitMode:
                        break

                    # If we have still unfinished jobs or possible new tasks, wait
                    logger.debug(
                        "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
                        self.unfinishedJobs,
                        self.taskOutputQueueSize,
                    )
                    if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
                        break

                    # Wait for more news...
                    await self.scheduler.exitCondition.wait()

            if self.failedJobs:
                # Show some more information; jobs that failed only because a
                # dependency failed are neither counted nor logged here
                count = 0
                for job in self.failedJobs.values():
                    if job.failure_status != JobFailureStatus.DEPENDENCY:
                        count += 1
                        logger.error(
                            "Job %s failed, check the log file %s",
                            job.relpath,
                            job.stderr,
                        )
                raise FailedExperiment(f"{count} failed jobs")

        # Block until the coroutine completes; exceptions raised inside the
        # coroutine (e.g. FailedExperiment) are re-raised in this thread
        future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
        return future.result()
212
+
213
+ def setenv(self, name, value, override=True):
214
+ """Shortcut to set the environment value"""
215
+ if override or name not in self.workspace.env:
216
+ logging.info("Setting environment: %s=%s", name, value)
217
+ self.workspace.env[name] = value
218
+
219
+ def token(self, name: str, count: int):
220
+ """Returns a token for this experiment
221
+
222
+ The token is the default token of the workspace connector"""
223
+ return self.workspace.connector.createtoken(name, count)
224
+
225
    def __enter__(self):
        """Enter the experiment context.

        Locks the experiment directory, backs up jobs from a previous run,
        starts the web server (if any), the scheduler and the task outputs
        worker, and makes this experiment the current one.
        """
        from .dynamic_outputs import TaskOutputsWorker

        # Take the experiment lock (skipped for dry runs)
        if self.workspace.run_mode != RunMode.DRY_RUN:
            logger.info("Locking experiment %s", self.xplockpath)
            self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
            logger.info("Experiment locked")

        # Move old jobs into "jobs.bak"
        if self.workspace.run_mode == RunMode.NORMAL:
            self.jobsbakpath.mkdir(exist_ok=True)
            for p in self.jobspath.glob("*/*"):
                if p.is_symlink():
                    target = self.jobsbakpath / p.relative_to(self.jobspath)
                    if target.is_symlink():
                        # Remove if duplicate
                        p.unlink()
                    else:
                        # Rename otherwise
                        target.parent.mkdir(parents=True, exist_ok=True)
                        p.rename(target)

        if self.server:
            self.server.start()

        # Enter the workspace and mark its directory as an experimaestro folder
        self.workspace.__enter__()
        (self.workspace.path / ".__experimaestro__").touch()

        # Number of unfinished jobs
        self.unfinishedJobs = 0
        # Number of task outputs still waiting to be processed
        self.taskOutputQueueSize = 0

        # List of failed jobs
        self.failedJobs: Dict[str, Job] = {}

        # Exit mode when catching signals
        self.exitMode = False

        self.scheduler.start_scheduler()
        self.taskOutputsWorker = TaskOutputsWorker(self)
        self.taskOutputsWorker.start()

        # Register for signal handling so the experiment can shut down cleanly
        SIGNAL_HANDLER.add(self)

        # Remember the previous current experiment so __exit__ can restore it
        self.old_experiment = experiment.CURRENT
        experiment.CURRENT = self
        return self
272
+
273
    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the experiment context.

        Waits for completion (unless an exception is propagating), then shuts
        down services, scheduler, workers, locks, the web server, and finally
        saves the experiment state (normal run mode only).
        """
        logger.debug("Exiting scheduler context")
        # If no exception and normal run mode, remove old "jobs"
        if self.workspace.run_mode == RunMode.NORMAL:
            if exc_type is None and self.jobsbakpath.is_dir():
                rmtree(self.jobsbakpath)

        # Close the different locks
        try:
            if exc_type:
                # import faulthandler
                # faulthandler.dump_traceback()
                logger.error(
                    "Not waiting since an exception was thrown"
                    " (some jobs may be running)"
                )
            else:
                self.wait()
        finally:
            # Always unregister from the signal handler, even if wait() raised
            SIGNAL_HANDLER.remove(self)

            # Stop services
            for service in self.services.values():
                logger.info("Closing service %s", service.description())
                service.stop()

            if self.scheduler is not None:
                logger.info("Stopping scheduler event loop")
                self.scheduler.loop.stop()

            if self.taskOutputsWorker is not None:
                logger.info("Stopping tasks outputs worker")
                # A None item is the worker's shutdown sentinel
                self.taskOutputsWorker.queue.put(None)

            self.workspace.__exit__(exc_type, exc_value, traceback)
            if self.xplock:
                self.xplock.__exit__(exc_type, exc_value, traceback)

            # Put back old experiment as current one
            experiment.CURRENT = self.old_experiment
            if self.server:
                logger.info("Stopping web server")
                self.server.stop()

            if self.workspace.run_mode == RunMode.NORMAL:
                # Write the state
                logging.info("Saving the experiment state")
                from experimaestro.scheduler.state import ExperimentState

                ExperimentState.save(
                    self.workdir / "state.json", self.scheduler.jobs.values()
                )
325
+
326
    async def update_task_output_count(self, delta: int):
        """Change in the number of task outputs to process

        Must run on the scheduler loop. When the queue drains to zero, wakes
        up coroutines waiting on the exit condition (see wait()).

        :param delta: signed change in the number of pending task outputs
        """
        async with self.scheduler.exitCondition:
            self.taskOutputQueueSize += delta
            logging.debug(
                "Updating queue size with %d => %d", delta, self.taskOutputQueueSize
            )
            if self.taskOutputQueueSize == 0:
                self.scheduler.exitCondition.notify_all()
335
+
336
+ def watch_output(self, watched: "WatchedOutput"):
337
+ """Watch an output
338
+
339
+ :param watched: The watched output specification
340
+ """
341
+
342
+ self.taskOutputsWorker.watch_output(watched)
343
+
344
+ def add_service(self, service: ServiceClass) -> ServiceClass:
345
+ """Adds a service (e.g. tensorboard viewer) to the experiment
346
+
347
+ :param service: A service instance
348
+ :return: The same service instance
349
+ """
350
+ self.services[service.id] = service
351
+ for listener in self.scheduler.listeners:
352
+ listener.service_add(service)
353
+ return service
354
+
355
+ def save(self, obj: Any, name: str = "default"):
356
+ """Serializes configurations.
357
+
358
+ Saves configuration objects within the experimental directory
359
+
360
+ :param obj: The object to save
361
+ :param name: The name of the saving directory (default to `default`)
362
+ """
363
+
364
+ if self.workspace.run_mode == RunMode.NORMAL:
365
+ from experimaestro import save
366
+
367
+ save_dir = self.workdir / "data" / name
368
+ save_dir.mkdir(exist_ok=True, parents=True)
369
+
370
+ save(obj, save_dir)
371
+
372
+ def load(self, reference: str, name: str = "default"):
373
+ """Serializes configurations.
374
+
375
+ Loads configuration objects from an experimental directory
376
+
377
+ :param reference: The name of the experiment
378
+ :param name: The name of the saving directory (default to `default`)
379
+ """
380
+ from experimaestro import load
381
+
382
+ path = self.workspace.experimentspath / reference / "data" / name
383
+ return load(path)
384
+
385
+
386
# re-export at the module level: `current()` returns the active experiment
current = experiment.current