experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (133) hide show
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +140 -16
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/progress.py +269 -0
  7. experimaestro/cli/refactor.py +249 -0
  8. experimaestro/click.py +0 -1
  9. experimaestro/commandline.py +19 -3
  10. experimaestro/connectors/__init__.py +22 -3
  11. experimaestro/connectors/local.py +12 -0
  12. experimaestro/core/arguments.py +192 -37
  13. experimaestro/core/identifier.py +127 -12
  14. experimaestro/core/objects/__init__.py +6 -0
  15. experimaestro/core/objects/config.py +702 -285
  16. experimaestro/core/objects/config_walk.py +24 -6
  17. experimaestro/core/serialization.py +91 -34
  18. experimaestro/core/serializers.py +1 -8
  19. experimaestro/core/subparameters.py +164 -0
  20. experimaestro/core/types.py +198 -83
  21. experimaestro/exceptions.py +26 -0
  22. experimaestro/experiments/cli.py +107 -25
  23. experimaestro/generators.py +50 -9
  24. experimaestro/huggingface.py +3 -1
  25. experimaestro/launcherfinder/parser.py +29 -0
  26. experimaestro/launcherfinder/registry.py +3 -3
  27. experimaestro/launchers/__init__.py +26 -1
  28. experimaestro/launchers/direct.py +12 -0
  29. experimaestro/launchers/slurm/base.py +154 -2
  30. experimaestro/mkdocs/base.py +6 -8
  31. experimaestro/mkdocs/metaloader.py +0 -1
  32. experimaestro/mypy.py +452 -7
  33. experimaestro/notifications.py +75 -16
  34. experimaestro/progress.py +404 -0
  35. experimaestro/rpyc.py +0 -1
  36. experimaestro/run.py +19 -6
  37. experimaestro/scheduler/__init__.py +18 -1
  38. experimaestro/scheduler/base.py +504 -959
  39. experimaestro/scheduler/dependencies.py +43 -28
  40. experimaestro/scheduler/dynamic_outputs.py +259 -130
  41. experimaestro/scheduler/experiment.py +582 -0
  42. experimaestro/scheduler/interfaces.py +474 -0
  43. experimaestro/scheduler/jobs.py +485 -0
  44. experimaestro/scheduler/services.py +186 -12
  45. experimaestro/scheduler/signal_handler.py +32 -0
  46. experimaestro/scheduler/state.py +1 -1
  47. experimaestro/scheduler/state_db.py +388 -0
  48. experimaestro/scheduler/state_provider.py +2345 -0
  49. experimaestro/scheduler/state_sync.py +834 -0
  50. experimaestro/scheduler/workspace.py +52 -10
  51. experimaestro/scriptbuilder.py +7 -0
  52. experimaestro/server/__init__.py +153 -32
  53. experimaestro/server/data/index.css +0 -125
  54. experimaestro/server/data/index.css.map +1 -1
  55. experimaestro/server/data/index.js +194 -58
  56. experimaestro/server/data/index.js.map +1 -1
  57. experimaestro/settings.py +47 -6
  58. experimaestro/sphinx/__init__.py +3 -3
  59. experimaestro/taskglobals.py +20 -0
  60. experimaestro/tests/conftest.py +80 -0
  61. experimaestro/tests/core/test_generics.py +2 -2
  62. experimaestro/tests/identifier_stability.json +45 -0
  63. experimaestro/tests/launchers/bin/sacct +6 -2
  64. experimaestro/tests/launchers/bin/sbatch +4 -2
  65. experimaestro/tests/launchers/common.py +2 -2
  66. experimaestro/tests/launchers/test_slurm.py +80 -0
  67. experimaestro/tests/restart.py +1 -1
  68. experimaestro/tests/tasks/all.py +7 -0
  69. experimaestro/tests/tasks/test_dynamic.py +231 -0
  70. experimaestro/tests/test_checkers.py +2 -2
  71. experimaestro/tests/test_cli_jobs.py +615 -0
  72. experimaestro/tests/test_dependencies.py +11 -17
  73. experimaestro/tests/test_deprecated.py +630 -0
  74. experimaestro/tests/test_environment.py +200 -0
  75. experimaestro/tests/test_experiment.py +3 -3
  76. experimaestro/tests/test_file_progress.py +425 -0
  77. experimaestro/tests/test_file_progress_integration.py +477 -0
  78. experimaestro/tests/test_forward.py +3 -3
  79. experimaestro/tests/test_generators.py +93 -0
  80. experimaestro/tests/test_identifier.py +520 -169
  81. experimaestro/tests/test_identifier_stability.py +458 -0
  82. experimaestro/tests/test_instance.py +16 -21
  83. experimaestro/tests/test_multitoken.py +442 -0
  84. experimaestro/tests/test_mypy.py +433 -0
  85. experimaestro/tests/test_objects.py +314 -30
  86. experimaestro/tests/test_outputs.py +8 -8
  87. experimaestro/tests/test_param.py +22 -26
  88. experimaestro/tests/test_partial_paths.py +231 -0
  89. experimaestro/tests/test_progress.py +2 -50
  90. experimaestro/tests/test_resumable_task.py +480 -0
  91. experimaestro/tests/test_serializers.py +141 -60
  92. experimaestro/tests/test_state_db.py +434 -0
  93. experimaestro/tests/test_subparameters.py +160 -0
  94. experimaestro/tests/test_tags.py +151 -15
  95. experimaestro/tests/test_tasks.py +137 -160
  96. experimaestro/tests/test_token_locking.py +252 -0
  97. experimaestro/tests/test_tokens.py +25 -19
  98. experimaestro/tests/test_types.py +133 -11
  99. experimaestro/tests/test_validation.py +19 -19
  100. experimaestro/tests/test_workspace_triggers.py +158 -0
  101. experimaestro/tests/token_reschedule.py +5 -3
  102. experimaestro/tests/utils.py +2 -2
  103. experimaestro/tokens.py +154 -57
  104. experimaestro/tools/diff.py +8 -1
  105. experimaestro/tui/__init__.py +8 -0
  106. experimaestro/tui/app.py +2303 -0
  107. experimaestro/tui/app.tcss +353 -0
  108. experimaestro/tui/log_viewer.py +228 -0
  109. experimaestro/typingutils.py +11 -2
  110. experimaestro/utils/__init__.py +23 -0
  111. experimaestro/utils/environment.py +148 -0
  112. experimaestro/utils/git.py +129 -0
  113. experimaestro/utils/resources.py +1 -1
  114. experimaestro/version.py +34 -0
  115. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
  116. experimaestro-2.0.0b4.dist-info/RECORD +181 -0
  117. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
  118. experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
  119. experimaestro/compat.py +0 -6
  120. experimaestro/core/objects.pyi +0 -225
  121. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  122. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  123. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  124. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  125. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  126. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  127. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  128. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  129. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  130. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  131. experimaestro-1.11.1.dist-info/RECORD +0 -158
  132. experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
  133. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,582 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ from pathlib import Path
6
+ import time
7
+ from shutil import rmtree
8
+ from typing import Any, Dict, Optional, TypeVar, Union
9
+
10
+ from experimaestro.core.objects import WatchedOutput
11
+ from experimaestro.exceptions import HandledException
12
+
13
+ from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
14
+ from experimaestro.scheduler.jobs import Job
15
+ from experimaestro.scheduler.services import Service
16
+ from experimaestro.scheduler.workspace import RunMode, Workspace
17
+ from experimaestro.settings import WorkspaceSettings, get_settings
18
+ from experimaestro.utils import logger
19
+
20
+ ServiceClass = TypeVar("ServiceClass", bound=Service)  # Generic Service subtype: lets experiment.add_service() declare that it returns the same type it receives
21
+
22
+
23
class FailedExperiment(HandledException):
    """Error signalling that an experiment did not complete successfully.

    Raised by ``experiment.wait()`` when the experiment has failed jobs.
    """
27
+
28
+
29
class DatabaseListener:
    """Listener that mirrors job and service state into the database.

    All persistence is delegated to ``state_provider``; every update is
    keyed by the experiment identifier and the run identifier given at
    construction time.
    """

    def __init__(self, state_provider, experiment_id: str, run_id: str):
        """
        :param state_provider: object exposing ``update_job_state`` and
            ``update_service``
        :param experiment_id: identifier of the experiment
        :param run_id: identifier of the current run of the experiment
        """
        self.state_provider = state_provider
        self.experiment_id = experiment_id
        self.run_id = run_id

    def _update_service(self, service) -> None:
        """Push the description, state name and state dict of a service"""
        self.state_provider.update_service(
            service.id,
            self.experiment_id,
            self.run_id,
            service.description(),
            service.state.name,
            # The state dictionary is stored as a JSON string
            state_dict=json.dumps(service.state_dict()),
        )

    def job_submitted(self, job):
        """No-op: submission is already handled in experiment.add_job()"""

    def job_state(self, job):
        """Update job state in database"""
        self.state_provider.update_job_state(job, self.experiment_id, self.run_id)

    def service_add(self, service):
        """Update service in database"""
        self._update_service(service)

    def service_state_changed(self, service):
        """Update service state in database (called by Service when state changes)"""
        self._update_service(service)
66
+
67
+
68
class experiment:
    """Context manager for running experiments.

    Creates a workspace, manages task submission, and optionally starts
    a web server for monitoring.

    Example::

        from experimaestro import experiment

        with experiment("./workdir", "my-experiment", port=12345) as xp:
            task = MyTask.C(param=42).submit()
            result = task.wait()
    """

    #: Current experiment
    CURRENT: Optional["experiment"] = None

    @staticmethod
    def current() -> "experiment":
        """Returns the current experiment, but checking first if set

        :raises AssertionError: if there is no current experiment
        """
        # Explicit raise (rather than `assert`) so that the check is not
        # stripped when Python runs with -O
        if experiment.CURRENT is None:
            raise AssertionError("No current experiment defined")
        return experiment.CURRENT

    def __init__(
        self,
        env: Union[Path, str, WorkspaceSettings],
        name: str,
        *,
        host: Optional[str] = None,
        port: Optional[int] = None,
        token: Optional[str] = None,
        run_mode: Optional[RunMode] = None,
        launcher=None,
        register_signals: bool = True,
    ):
        """
        :param env: an environment -- or a working directory for a local
            environment

        :param name: the identifier of the experiment

        :param launcher: The launcher (if not provided, inferred from path)

        :param host: The host for the web server (overrides the environment if
            set)
        :param port: the port for the web server (overrides the environment if
            set). Use negative number to avoid running a web server (default when dry run).

        :param token: The token stored in the server settings (overrides the
            environment if set)

        :param run_mode: The run mode for the experiment (normal, generate run
            files, dry run)

        :param register_signals: Whether to register signal handlers (default: True).
            Set to False when running in a background thread.
        """

        from experimaestro.scheduler import Listener, Scheduler

        settings = get_settings()
        if not isinstance(env, WorkspaceSettings):
            env = WorkspaceSettings(id=None, path=Path(env))

        # Creates the workspace
        run_mode = run_mode or RunMode.NORMAL
        self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)

        # Mark the directory has an experimaestro folder
        self.workdir = self.workspace.experimentspath / name
        self.workdir.mkdir(parents=True, exist_ok=True)
        self.xplockpath = self.workdir / "lock"
        self.xplock = None
        self.old_experiment = None
        self.services: Dict[str, Service] = {}
        self._job_listener: Optional[Listener] = None
        self._register_signals = register_signals

        # Get configuration settings

        if host is not None:
            settings.server.host = host

        if port is not None:
            settings.server.port = port

        if token is not None:
            settings.server.token = token

        # Use singleton scheduler
        self.scheduler = Scheduler.instance()

        # Determine if we need a server: a non-negative port must be
        # configured and the experiment must run in normal mode
        self._needs_server = (
            settings.server.port is not None and settings.server.port >= 0
        ) and self.workspace.run_mode == RunMode.NORMAL
        self._server_settings = settings.server if self._needs_server else None

        if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
            import faulthandler

            logger.info("Enabling fault handler")
            faulthandler.enable(all_threads=True)

    def submit(self, job: Job):
        """Submit a job through the scheduler and return its result"""
        return self.scheduler.submit(job)

    def prepare(self, job: Job):
        """Generate the file"""
        return self.scheduler.prepare(job)

    @property
    def run_mode(self):
        """The run mode of the underlying workspace"""
        return self.workspace.run_mode

    @property
    def loop(self):
        """The event loop of the scheduler"""
        assert self.scheduler is not None, "No scheduler defined"
        return self.scheduler.loop

    @property
    def server(self):
        """Access the server via the scheduler"""
        return self.scheduler.server if self.scheduler else None

    @property
    def resultspath(self):
        """Return the directory in which results can be stored for this experiment"""
        return self.workdir / "results"

    @property
    def jobspath(self):
        """Return the ``jobs`` directory of this experiment"""
        return self.workdir / "jobs"

    @property
    def alt_jobspaths(self):
        """Return potential other directories"""
        for alt_workdir in self.workspace.alt_workdirs:
            yield alt_workdir / "jobs"

    @property
    def jobsbakpath(self):
        """Return the ``jobs.bak`` directory, where the job links of a
        previous run are moved when entering the experiment"""
        return self.workdir / "jobs.bak"

    @property
    def jobs_jsonl_path(self):
        """Return the path to the jobs.jsonl file for this experiment"""
        return self.workdir / "jobs.jsonl"

    @property
    def services_json_path(self):
        """Return the path to the services.json file for this experiment"""
        return self.workdir / "services.json"

    def _write_services_json(self):
        """Write all services to services.json file"""
        now_fields = {}
        services_data = {}
        for service_id, service in self.services.items():
            # Runtime state info added on top of the service state_dict
            now_fields = {
                "service_id": service_id,
                "description": service.description(),
                "state": service.state.name,
                "url": getattr(service, "url", None),
                "timestamp": time.time(),
            }
            # Build a new dictionary (state_dict keys first, includes
            # __class__ for recreation) so that the dictionary returned by
            # the service is never modified
            services_data[service_id] = {**service.state_dict(), **now_fields}

        with self.services_json_path.open("w") as f:
            json.dump(services_data, f, indent=2)

    def add_job(self, job: "Job"):
        """Register a job and its tags to jobs.jsonl file and database

        Note: For NEW jobs, the unfinishedJobs counter is updated by
        job.set_state() when the state transitions from UNSCHEDULED.
        For jobs already running, we increment here since no state
        transition will occur.
        """
        from experimaestro.scheduler.interfaces import JobState

        if self in job.experiments:
            # Do not double register
            return

        # Track which experiments this job belongs to
        job.experiments.append(self)

        # If job is already being tracked (not UNSCHEDULED and not finished),
        # increment unfinishedJobs since no state transition will trigger it
        if job.state != JobState.UNSCHEDULED and not job.state.finished():
            self.unfinishedJobs += 1
            logging.debug(
                "Job %s already running, unfinished jobs for %s: %d",
                job.identifier[:8],
                self.workdir.name,
                self.unfinishedJobs,
            )

        record = {
            "job_id": job.identifier,
            "task_id": str(job.type.identifier),
            "tags": dict(job.tags.items()) if job.tags else {},
            "timestamp": time.time(),
        }

        # One JSON record per line, appended to the experiment jobs file
        with self.jobs_jsonl_path.open("a") as f:
            f.write(json.dumps(record) + "\n")

        # Also register in database for TUI/monitoring
        experiment_id = self.workdir.name
        self.state_provider.update_job_submitted(job, experiment_id, self.run_id)

    def stop(self):
        """Stop the experiment as soon as possible"""

        async def doStop():
            assert self.scheduler is not None
            async with self.scheduler.exitCondition:
                self.exitMode = True
                logging.debug("Setting exit mode to true")
                # Wake up wait() so that it notices the exit mode
                self.scheduler.exitCondition.notify_all()

        assert self.scheduler is not None and self.scheduler.loop is not None
        asyncio.run_coroutine_threadsafe(doStop(), self.scheduler.loop)

    def wait(self):
        """Wait until the running processes have finished

        Blocks the calling thread until the exit mode is set, or until there
        is neither an unfinished job nor a pending task output.

        :raises FailedExperiment: if some jobs failed
        """

        async def awaitcompletion():
            assert self.scheduler is not None, "No scheduler defined"
            logger.debug("Waiting to exit scheduler...")
            async with self.scheduler.exitCondition:
                while not self.exitMode:
                    # If we have still unfinished jobs or possible new tasks, wait
                    logger.debug(
                        "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
                        self.unfinishedJobs,
                        self.taskOutputQueueSize,
                    )
                    if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
                        break

                    # Wait for more news...
                    await self.scheduler.exitCondition.wait()

                if self.failedJobs:
                    # Show some more information
                    from experimaestro.scheduler.jobs import (
                        JobStateError,
                        JobFailureStatus,
                    )

                    count = 0
                    for job in self.failedJobs.values():
                        # Skip dependency failures - only log direct failures.
                        # A state that is not a JobStateError should not
                        # happen, but it is counted anyway.
                        dependency_failure = (
                            isinstance(job.state, JobStateError)
                            and job.state.failure_reason
                            == JobFailureStatus.DEPENDENCY
                        )
                        if dependency_failure:
                            continue

                        count += 1
                        logger.error(
                            "Job %s failed, check the log file %s",
                            job.relpath,
                            job.stderr,
                        )
                    raise FailedExperiment(f"{count} failed jobs")

        future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
        return future.result()

    def setenv(self, name, value, override=True):
        """Shortcut to set the environment value

        :param name: name of the environment variable
        :param value: value of the environment variable
        :param override: when False, an already defined variable is kept
        """
        if override or name not in self.workspace.env:
            logging.info("Setting environment: %s=%s", name, value)
            self.workspace.env[name] = value

    def token(self, name: str, count: int):
        """Returns a token for this experiment

        The token is the default token of the workspace connector"""
        return self.workspace.connector.createtoken(name, count)

    def __enter__(self):
        """Lock the experiment, register it and start the supporting workers

        :return: the experiment itself, which becomes ``experiment.CURRENT``
        """
        from .dynamic_outputs import TaskOutputsWorker
        from experimaestro.utils.environment import save_environment_info

        if self.workspace.run_mode != RunMode.DRY_RUN:
            logger.info("Locking experiment %s", self.xplockpath)
            self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
            logger.info("Experiment locked")

        # Capture and save environment info (git info for editable packages + all package versions)
        if self.workspace.run_mode == RunMode.NORMAL:
            env_info_path = self.workdir / "environment.json"
            save_environment_info(env_info_path)

        # Move old jobs into "jobs.bak"
        if self.workspace.run_mode == RunMode.NORMAL:
            self.jobsbakpath.mkdir(exist_ok=True)
            for p in self.jobspath.glob("*/*"):
                if p.is_symlink():
                    target = self.jobsbakpath / p.relative_to(self.jobspath)
                    if target.is_symlink():
                        # Remove if duplicate
                        p.unlink()
                    else:
                        # Rename otherwise
                        target.parent.mkdir(parents=True, exist_ok=True)
                        p.rename(target)

        # Register experiment with scheduler
        self.scheduler.register_experiment(self)

        # Start server via scheduler if needed
        if self._needs_server:
            self.scheduler.start_server(self._server_settings, workspace=self.workspace)

        self.workspace.__enter__()
        (self.workspace.path / ".__experimaestro__").touch()

        # Initialize workspace state provider (singleton per workspace path)
        from .state_provider import WorkspaceStateProvider

        self.state_provider = WorkspaceStateProvider.get_instance(
            self.workspace.path,
            read_only=False,
            sync_on_start=False,  # Experiments don't sync on start
        )

        # Register experiment in database and create a run
        experiment_id = self.workdir.name
        self.state_provider.ensure_experiment(experiment_id)
        self.run_id = self.state_provider.create_run(experiment_id)

        # Add database listener to update job state in database
        self._db_listener = DatabaseListener(
            self.state_provider, experiment_id, self.run_id
        )
        self.scheduler.addlistener(self._db_listener)

        # Number of unfinished jobs
        self.unfinishedJobs = 0
        self.taskOutputQueueSize = 0

        # List of failed jobs
        self.failedJobs: Dict[str, Job] = {}

        # Exit mode when catching signals
        self.exitMode = False

        # Note: scheduler is already running as singleton
        self.taskOutputsWorker = TaskOutputsWorker(self)
        self.taskOutputsWorker.start()

        if self._register_signals:
            SIGNAL_HANDLER.add(self)

        self.old_experiment = experiment.CURRENT
        experiment.CURRENT = self
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Wait for the jobs (unless an exception was raised) and clean up

        Exceptions are never swallowed: an exception raised in the ``with``
        body, or by ``wait()``, propagates to the caller. In both cases the
        run is recorded as ``failed`` in the database.
        """
        logger.debug("Exiting scheduler context")
        # If no exception and normal run mode, remove old "jobs"
        if self.workspace.run_mode == RunMode.NORMAL:
            if exc_type is None and self.jobsbakpath.is_dir():
                rmtree(self.jobsbakpath)

        # The run failed if the body raised, or if wait() raises below
        failed = exc_type is not None

        # Close the different locks
        try:
            if exc_type:
                # import faulthandler
                # faulthandler.dump_traceback()
                logger.error(
                    "Not waiting since an exception was thrown"
                    " (some jobs may be running)"
                )
            else:
                try:
                    self.wait()
                except BaseException:
                    # e.g. FailedExperiment: remember the failure so that the
                    # run is not recorded as completed, then propagate
                    failed = True
                    raise
        finally:
            if self._register_signals:
                SIGNAL_HANDLER.remove(self)

            # Stop services
            for service in self.services.values():
                logger.info("Closing service %s", service.description())
                service.stop()

            # Unregister experiment from scheduler
            self.scheduler.unregister_experiment(self)

            # Remove database listener
            self.scheduler.removelistener(self._db_listener)

            # Mark run as completed in database
            experiment_id = self.workdir.name
            status = "failed" if failed else "completed"
            self.state_provider.complete_run(experiment_id, self.run_id, status)

            # Note: Don't stop scheduler - it's shared!
            # Note: Don't stop server - it runs in daemon mode until program exit

            if self.taskOutputsWorker is not None:
                logger.info("Stopping tasks outputs worker")
                # None is put on the worker queue to ask it to stop
                self.taskOutputsWorker.queue.put(None)

            self.workspace.__exit__(exc_type, exc_value, traceback)
            if self.xplock:
                self.xplock.__exit__(exc_type, exc_value, traceback)

            # Put back old experiment as current one
            experiment.CURRENT = self.old_experiment

        if self.workspace.run_mode == RunMode.NORMAL:
            # Write the state
            logging.info("Saving the experiment state")
            from experimaestro.scheduler.state import ExperimentState

            ExperimentState.save(
                self.workdir / "state.json", self.scheduler.jobs.values()
            )

    async def update_task_output_count(self, delta: int):
        """Change in the number of task outputs to process

        :param delta: value added to the task output queue size
        """
        async with self.scheduler.exitCondition:
            self.taskOutputQueueSize += delta
            logging.debug(
                "Updating queue size with %d => %d", delta, self.taskOutputQueueSize
            )
            # An empty queue is one of the exit conditions checked by wait()
            if self.taskOutputQueueSize == 0:
                self.scheduler.exitCondition.notify_all()

    def watch_output(self, watched: "WatchedOutput"):
        """Watch an output

        :param watched: The watched output specification
        """

        self.taskOutputsWorker.watch_output(watched)

    def add_service(self, service: ServiceClass) -> ServiceClass:
        """Adds a service (e.g. tensorboard viewer) to the experiment

        The database listener is created in ``__enter__``, so this method
        must be called once the experiment context has been entered.

        :param service: A service instance
        :return: The same service instance
        """
        self.services[service.id] = service

        # Register database listener for state changes
        service.add_listener(self._db_listener)

        # Register file listener for state changes (writes to services.json)
        service.add_listener(self)

        self.scheduler.notify_service_add(service)

        # Write services.json file
        self._write_services_json()

        return service

    def service_state_changed(self, service):
        """Called when a service state changes - update services.json"""
        self._write_services_json()

    def save(self, obj: Any, name: str = "default"):
        """Serializes configurations.

        Saves configuration objects within the experimental directory
        (only when the run mode is normal; otherwise nothing is written)

        :param obj: The object to save
        :param name: The name of the saving directory (default to `default`)
        """

        if self.workspace.run_mode == RunMode.NORMAL:
            from experimaestro import save

            save_dir = self.workdir / "data" / name
            save_dir.mkdir(exist_ok=True, parents=True)

            save(obj, save_dir)

    def load(self, reference: str, name: str = "default"):
        """Deserializes configurations.

        Loads configuration objects from an experimental directory

        :param reference: The name of the experiment
        :param name: The name of the saving directory (default to `default`)
        :return: The value returned by ``experimaestro.load`` for that directory
        """
        from experimaestro import load

        path = self.workspace.experimentspath / reference / "data" / name
        return load(path)
579
+
580
+
581
+ # Re-export at the module level: the module attribute `current` is the static method `experiment.current`
582
+ current = experiment.current