experimaestro-2.0.0a8-py3-none-any.whl → experimaestro-2.0.0b8-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (122)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +278 -7
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/refactor.py +249 -0
  7. experimaestro/click.py +0 -1
  8. experimaestro/commandline.py +19 -3
  9. experimaestro/connectors/__init__.py +20 -1
  10. experimaestro/connectors/local.py +12 -0
  11. experimaestro/core/arguments.py +182 -46
  12. experimaestro/core/identifier.py +107 -6
  13. experimaestro/core/objects/__init__.py +6 -0
  14. experimaestro/core/objects/config.py +542 -25
  15. experimaestro/core/objects/config_walk.py +20 -0
  16. experimaestro/core/serialization.py +91 -34
  17. experimaestro/core/subparameters.py +164 -0
  18. experimaestro/core/types.py +175 -38
  19. experimaestro/exceptions.py +26 -0
  20. experimaestro/experiments/cli.py +111 -25
  21. experimaestro/generators.py +50 -9
  22. experimaestro/huggingface.py +3 -1
  23. experimaestro/launcherfinder/parser.py +29 -0
  24. experimaestro/launchers/__init__.py +26 -1
  25. experimaestro/launchers/direct.py +12 -0
  26. experimaestro/launchers/slurm/base.py +154 -2
  27. experimaestro/mkdocs/metaloader.py +0 -1
  28. experimaestro/mypy.py +452 -7
  29. experimaestro/notifications.py +63 -13
  30. experimaestro/progress.py +0 -2
  31. experimaestro/rpyc.py +0 -1
  32. experimaestro/run.py +19 -6
  33. experimaestro/scheduler/base.py +510 -125
  34. experimaestro/scheduler/dependencies.py +43 -28
  35. experimaestro/scheduler/dynamic_outputs.py +259 -130
  36. experimaestro/scheduler/experiment.py +256 -31
  37. experimaestro/scheduler/interfaces.py +501 -0
  38. experimaestro/scheduler/jobs.py +216 -206
  39. experimaestro/scheduler/remote/__init__.py +31 -0
  40. experimaestro/scheduler/remote/client.py +874 -0
  41. experimaestro/scheduler/remote/protocol.py +467 -0
  42. experimaestro/scheduler/remote/server.py +423 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +323 -23
  45. experimaestro/scheduler/state_db.py +437 -0
  46. experimaestro/scheduler/state_provider.py +2766 -0
  47. experimaestro/scheduler/state_sync.py +891 -0
  48. experimaestro/scheduler/workspace.py +52 -10
  49. experimaestro/scriptbuilder.py +7 -0
  50. experimaestro/server/__init__.py +147 -57
  51. experimaestro/server/data/index.css +0 -125
  52. experimaestro/server/data/index.css.map +1 -1
  53. experimaestro/server/data/index.js +194 -58
  54. experimaestro/server/data/index.js.map +1 -1
  55. experimaestro/settings.py +44 -5
  56. experimaestro/sphinx/__init__.py +3 -3
  57. experimaestro/taskglobals.py +20 -0
  58. experimaestro/tests/conftest.py +80 -0
  59. experimaestro/tests/core/test_generics.py +2 -2
  60. experimaestro/tests/identifier_stability.json +45 -0
  61. experimaestro/tests/launchers/bin/sacct +6 -2
  62. experimaestro/tests/launchers/bin/sbatch +4 -2
  63. experimaestro/tests/launchers/test_slurm.py +80 -0
  64. experimaestro/tests/tasks/test_dynamic.py +231 -0
  65. experimaestro/tests/test_cli_jobs.py +615 -0
  66. experimaestro/tests/test_deprecated.py +630 -0
  67. experimaestro/tests/test_environment.py +200 -0
  68. experimaestro/tests/test_file_progress_integration.py +1 -1
  69. experimaestro/tests/test_forward.py +3 -3
  70. experimaestro/tests/test_identifier.py +372 -41
  71. experimaestro/tests/test_identifier_stability.py +458 -0
  72. experimaestro/tests/test_instance.py +3 -3
  73. experimaestro/tests/test_multitoken.py +442 -0
  74. experimaestro/tests/test_mypy.py +433 -0
  75. experimaestro/tests/test_objects.py +312 -5
  76. experimaestro/tests/test_outputs.py +2 -2
  77. experimaestro/tests/test_param.py +8 -12
  78. experimaestro/tests/test_partial_paths.py +231 -0
  79. experimaestro/tests/test_progress.py +0 -48
  80. experimaestro/tests/test_remote_state.py +671 -0
  81. experimaestro/tests/test_resumable_task.py +480 -0
  82. experimaestro/tests/test_serializers.py +141 -1
  83. experimaestro/tests/test_state_db.py +434 -0
  84. experimaestro/tests/test_subparameters.py +160 -0
  85. experimaestro/tests/test_tags.py +136 -0
  86. experimaestro/tests/test_tasks.py +107 -121
  87. experimaestro/tests/test_token_locking.py +252 -0
  88. experimaestro/tests/test_tokens.py +17 -13
  89. experimaestro/tests/test_types.py +123 -1
  90. experimaestro/tests/test_workspace_triggers.py +158 -0
  91. experimaestro/tests/token_reschedule.py +4 -2
  92. experimaestro/tests/utils.py +2 -2
  93. experimaestro/tokens.py +154 -57
  94. experimaestro/tools/diff.py +1 -1
  95. experimaestro/tui/__init__.py +8 -0
  96. experimaestro/tui/app.py +2395 -0
  97. experimaestro/tui/app.tcss +353 -0
  98. experimaestro/tui/log_viewer.py +228 -0
  99. experimaestro/utils/__init__.py +23 -0
  100. experimaestro/utils/environment.py +148 -0
  101. experimaestro/utils/git.py +129 -0
  102. experimaestro/utils/resources.py +1 -1
  103. experimaestro/version.py +34 -0
  104. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
  105. experimaestro-2.0.0b8.dist-info/RECORD +187 -0
  106. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
  107. experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
  108. experimaestro/compat.py +0 -6
  109. experimaestro/core/objects.pyi +0 -221
  110. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  111. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  112. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  113. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  114. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  115. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  116. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  117. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  118. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  119. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  120. experimaestro-2.0.0a8.dist-info/RECORD +0 -166
  121. experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
  122. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,9 @@
1
1
  import asyncio
2
+ import json
2
3
  import logging
3
4
  import os
4
5
  from pathlib import Path
6
+ import time
5
7
  from shutil import rmtree
6
8
  from typing import Any, Dict, Optional, TypeVar, Union
7
9
 
@@ -9,7 +11,7 @@ from experimaestro.core.objects import WatchedOutput
9
11
  from experimaestro.exceptions import HandledException
10
12
 
11
13
  from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
12
- from experimaestro.scheduler.jobs import Job, JobFailureStatus
14
+ from experimaestro.scheduler.jobs import Job
13
15
  from experimaestro.scheduler.services import Service
14
16
  from experimaestro.scheduler.workspace import RunMode, Workspace
15
17
  from experimaestro.settings import WorkspaceSettings, get_settings
@@ -24,15 +26,54 @@ class FailedExperiment(HandledException):
24
26
  pass
25
27
 
26
28
 
29
+ class DatabaseListener:
30
+ """Listener that updates job state in the database"""
31
+
32
+ def __init__(self, state_provider, experiment_id: str, run_id: str):
33
+ self.state_provider = state_provider
34
+ self.experiment_id = experiment_id
35
+ self.run_id = run_id
36
+
37
+ def job_submitted(self, job):
38
+ # Already handled in experiment.add_job()
39
+ pass
40
+
41
+ def job_state(self, job):
42
+ """Update job state in database"""
43
+ self.state_provider.update_job_state(job, self.experiment_id, self.run_id)
44
+
45
+ def service_add(self, service):
46
+ """Register service in database"""
47
+ from experimaestro.scheduler.services import Service
48
+
49
+ state_dict = Service.serialize_state_dict(service._full_state_dict())
50
+ self.state_provider.register_service(
51
+ service.id,
52
+ self.experiment_id,
53
+ self.run_id,
54
+ service.description(),
55
+ state_dict=json.dumps(state_dict),
56
+ )
57
+
58
+ def service_state_changed(self, service):
59
+ """Called when service state changes (runtime only, not persisted)"""
60
+ # Service state is managed at runtime, not persisted to DB
61
+ pass
62
+
63
+
27
64
  class experiment:
28
- """Main experiment object
65
+ """Context manager for running experiments.
66
+
67
+ Creates a workspace, manages task submission, and optionally starts
68
+ a web server for monitoring.
29
69
 
30
- It is a context object, i.e. an experiment is run with
70
+ Example::
31
71
 
32
- ```py
33
- with experiment(...) as xp:
34
- ...
35
- ```
72
+ from experimaestro import experiment
73
+
74
+ with experiment("./workdir", "my-experiment", port=12345) as xp:
75
+ task = MyTask.C(param=42).submit()
76
+ result = task.wait()
36
77
  """
37
78
 
38
79
  #: Current experiment
@@ -57,6 +98,7 @@ class experiment:
57
98
  token: Optional[str] = None,
58
99
  run_mode: Optional[RunMode] = None,
59
100
  launcher=None,
101
+ register_signals: bool = True,
60
102
  ):
61
103
  """
62
104
  :param env: an environment -- or a working directory for a local
@@ -73,9 +115,11 @@ class experiment:
73
115
 
74
116
  :param run_mode: The run mode for the experiment (normal, generate run
75
117
  files, dry run)
118
+
119
+ :param register_signals: Whether to register signal handlers (default: True).
120
+ Set to False when running in a background thread.
76
121
  """
77
122
 
78
- from experimaestro.server import Server
79
123
  from experimaestro.scheduler import Listener, Scheduler
80
124
 
81
125
  settings = get_settings()
@@ -94,6 +138,7 @@ class experiment:
94
138
  self.old_experiment = None
95
139
  self.services: Dict[str, Service] = {}
96
140
  self._job_listener: Optional[Listener] = None
141
+ self._register_signals = register_signals
97
142
 
98
143
  # Get configuration settings
99
144
 
@@ -106,14 +151,14 @@ class experiment:
106
151
  if token is not None:
107
152
  settings.server.token = token
108
153
 
109
- # Create the scheduler
110
- self.scheduler = Scheduler.create(self, name)
111
- self.server = (
112
- Server(self.scheduler, settings.server)
113
- if (settings.server.port is not None and settings.server.port >= 0)
114
- and self.workspace.run_mode == RunMode.NORMAL
115
- else None
116
- )
154
+ # Use singleton scheduler
155
+ self.scheduler = Scheduler.instance()
156
+
157
+ # Determine if we need a server
158
+ self._needs_server = (
159
+ settings.server.port is not None and settings.server.port >= 0
160
+ ) and self.workspace.run_mode == RunMode.NORMAL
161
+ self._server_settings = settings.server if self._needs_server else None
117
162
 
118
163
  if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
119
164
  import faulthandler
@@ -137,6 +182,11 @@ class experiment:
137
182
  assert self.scheduler is not None, "No scheduler defined"
138
183
  return self.scheduler.loop
139
184
 
185
+ @property
186
+ def server(self):
187
+ """Access the server via the scheduler"""
188
+ return self.scheduler.server if self.scheduler else None
189
+
140
190
  @property
141
191
  def resultspath(self):
142
192
  """Return the directory in which results can be stored for this experiment"""
@@ -158,6 +208,83 @@ class experiment:
158
208
  """Return the directory in which results can be stored for this experiment"""
159
209
  return self.workdir / "jobs.bak"
160
210
 
211
+ @property
212
+ def jobs_jsonl_path(self):
213
+ """Return the path to the jobs.jsonl file for this experiment"""
214
+ return self.workdir / "jobs.jsonl"
215
+
216
+ @property
217
+ def services_json_path(self):
218
+ """Return the path to the services.json file for this experiment"""
219
+ return self.workdir / "services.json"
220
+
221
+ def _write_services_json(self):
222
+ """Write all services to services.json file"""
223
+ from experimaestro.scheduler.services import Service
224
+
225
+ services_data = {}
226
+ for service_id, service in self.services.items():
227
+ # Get state_dict from service (includes __class__ for recreation)
228
+ # and serialize paths to JSON-compatible format
229
+ service_state = Service.serialize_state_dict(service._full_state_dict())
230
+ # Add runtime state info
231
+ service_state.update(
232
+ {
233
+ "service_id": service_id,
234
+ "description": service.description(),
235
+ "state": service.state.name,
236
+ "url": getattr(service, "url", None),
237
+ "timestamp": time.time(),
238
+ }
239
+ )
240
+ services_data[service_id] = service_state
241
+
242
+ with self.services_json_path.open("w") as f:
243
+ json.dump(services_data, f, indent=2)
244
+
245
+ def add_job(self, job: "Job"):
246
+ """Register a job and its tags to jobs.jsonl file and database
247
+
248
+ Note: For NEW jobs, the unfinishedJobs counter is updated by
249
+ job.set_state() when the state transitions from UNSCHEDULED.
250
+ For jobs already running, we increment here since no state
251
+ transition will occur.
252
+ """
253
+ from experimaestro.scheduler.interfaces import JobState
254
+
255
+ if self in job.experiments:
256
+ # Do not double register
257
+ return
258
+
259
+ # Track which experiments this job belongs to
260
+ job.experiments.append(self)
261
+
262
+ # If job is already being tracked (not UNSCHEDULED and not finished),
263
+ # increment unfinishedJobs since no state transition will trigger it
264
+ if job.state != JobState.UNSCHEDULED and not job.state.finished():
265
+ self.unfinishedJobs += 1
266
+ logging.debug(
267
+ "Job %s already running, unfinished jobs for %s: %d",
268
+ job.identifier[:8],
269
+ self.workdir.name,
270
+ self.unfinishedJobs,
271
+ )
272
+
273
+ record = {
274
+ "job_id": job.identifier,
275
+ "task_id": str(job.type.identifier),
276
+ "tags": dict(job.tags.items()) if job.tags else {},
277
+ "timestamp": time.time(),
278
+ }
279
+
280
+ with self.jobs_jsonl_path.open("a") as f:
281
+ f.write(json.dumps(record) + "\n")
282
+
283
+ # Also register in database for TUI/monitoring (only in NORMAL mode)
284
+ if self._db_listener is not None:
285
+ experiment_id = self.workdir.name
286
+ self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
287
+
161
288
  def stop(self):
162
289
  """Stop the experiment as soon as possible"""
163
290
 
@@ -196,9 +323,24 @@ class experiment:
196
323
 
197
324
  if self.failedJobs:
198
325
  # Show some more information
326
+ from experimaestro.scheduler.jobs import (
327
+ JobStateError,
328
+ JobFailureStatus,
329
+ )
330
+
199
331
  count = 0
200
332
  for job in self.failedJobs.values():
201
- if job.failure_status != JobFailureStatus.DEPENDENCY:
333
+ # Skip dependency failures - only log direct failures
334
+ if isinstance(job.state, JobStateError):
335
+ if job.state.failure_reason != JobFailureStatus.DEPENDENCY:
336
+ count += 1
337
+ logger.error(
338
+ "Job %s failed, check the log file %s",
339
+ job.relpath,
340
+ job.stderr,
341
+ )
342
+ else:
343
+ # Should not happen, but count it anyway
202
344
  count += 1
203
345
  logger.error(
204
346
  "Job %s failed, check the log file %s",
@@ -224,12 +366,18 @@ class experiment:
224
366
 
225
367
  def __enter__(self):
226
368
  from .dynamic_outputs import TaskOutputsWorker
369
+ from experimaestro.utils.environment import save_environment_info
227
370
 
228
371
  if self.workspace.run_mode != RunMode.DRY_RUN:
229
372
  logger.info("Locking experiment %s", self.xplockpath)
230
373
  self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
231
374
  logger.info("Experiment locked")
232
375
 
376
+ # Capture and save environment info (git info for editable packages + all package versions)
377
+ if self.workspace.run_mode == RunMode.NORMAL:
378
+ env_info_path = self.workdir / "environment.json"
379
+ save_environment_info(env_info_path)
380
+
233
381
  # Move old jobs into "jobs.bak"
234
382
  if self.workspace.run_mode == RunMode.NORMAL:
235
383
  self.jobsbakpath.mkdir(exist_ok=True)
@@ -244,12 +392,43 @@ class experiment:
244
392
  target.parent.mkdir(parents=True, exist_ok=True)
245
393
  p.rename(target)
246
394
 
247
- if self.server:
248
- self.server.start()
395
+ # Register experiment with scheduler
396
+ self.scheduler.register_experiment(self)
397
+
398
+ # Start server via scheduler if needed
399
+ if self._needs_server:
400
+ self.scheduler.start_server(self._server_settings, workspace=self.workspace)
249
401
 
250
402
  self.workspace.__enter__()
251
403
  (self.workspace.path / ".__experimaestro__").touch()
252
404
 
405
+ # Initialize workspace state provider (singleton per workspace path)
406
+ # Use read_only mode when not in NORMAL run mode to prevent DB changes
407
+ from .state_provider import WorkspaceStateProvider
408
+
409
+ is_normal_mode = self.workspace.run_mode == RunMode.NORMAL
410
+ self.state_provider = WorkspaceStateProvider.get_instance(
411
+ self.workspace.path,
412
+ read_only=not is_normal_mode,
413
+ sync_on_start=False, # Experiments don't sync on start
414
+ )
415
+
416
+ # Register experiment in database and create a run (only in NORMAL mode)
417
+ experiment_id = self.workdir.name
418
+ self._db_listener = None
419
+ if is_normal_mode:
420
+ self.state_provider.ensure_experiment(experiment_id)
421
+ self.run_id = self.state_provider.create_run(experiment_id)
422
+
423
+ # Add database listener to update job state in database
424
+ self._db_listener = DatabaseListener(
425
+ self.state_provider, experiment_id, self.run_id
426
+ )
427
+ self.scheduler.addlistener(self._db_listener)
428
+ else:
429
+ # In non-NORMAL modes, use a placeholder run_id
430
+ self.run_id = None
431
+
253
432
  # Number of unfinished jobs
254
433
  self.unfinishedJobs = 0
255
434
  self.taskOutputQueueSize = 0
@@ -260,11 +439,12 @@ class experiment:
260
439
  # Exit mode when catching signals
261
440
  self.exitMode = False
262
441
 
263
- self.scheduler.start_scheduler()
442
+ # Note: scheduler is already running as singleton
264
443
  self.taskOutputsWorker = TaskOutputsWorker(self)
265
444
  self.taskOutputsWorker.start()
266
445
 
267
- SIGNAL_HANDLER.add(self)
446
+ if self._register_signals:
447
+ SIGNAL_HANDLER.add(self)
268
448
 
269
449
  self.old_experiment = experiment.CURRENT
270
450
  experiment.CURRENT = self
@@ -288,17 +468,33 @@ class experiment:
288
468
  )
289
469
  else:
290
470
  self.wait()
471
+
472
+ # Wait for all pending notifications to be processed
473
+ # before removing listeners
474
+ self.scheduler.wait_for_notifications()
291
475
  finally:
292
- SIGNAL_HANDLER.remove(self)
476
+ if self._register_signals:
477
+ SIGNAL_HANDLER.remove(self)
293
478
 
294
479
  # Stop services
295
480
  for service in self.services.values():
296
481
  logger.info("Closing service %s", service.description())
297
482
  service.stop()
298
483
 
299
- if self.scheduler is not None:
300
- logger.info("Stopping scheduler event loop")
301
- self.scheduler.loop.stop()
484
+ # Unregister experiment from scheduler
485
+ self.scheduler.unregister_experiment(self)
486
+
487
+ # Remove database listener and mark run as completed (only in NORMAL mode)
488
+ if self._db_listener is not None:
489
+ self.scheduler.removelistener(self._db_listener)
490
+
491
+ # Mark run as completed in database
492
+ experiment_id = self.workdir.name
493
+ status = "failed" if exc_type else "completed"
494
+ self.state_provider.complete_run(experiment_id, self.run_id, status)
495
+
496
+ # Note: Don't stop scheduler - it's shared!
497
+ # Note: Don't stop server - it runs in daemon mode until program exit
302
498
 
303
499
  if self.taskOutputsWorker is not None:
304
500
  logger.info("Stopping tasks outputs worker")
@@ -310,9 +506,6 @@ class experiment:
310
506
 
311
507
  # Put back old experiment as current one
312
508
  experiment.CURRENT = self.old_experiment
313
- if self.server:
314
- logger.info("Stopping web server")
315
- self.server.stop()
316
509
 
317
510
  if self.workspace.run_mode == RunMode.NORMAL:
318
511
  # Write the state
@@ -345,13 +538,45 @@ class experiment:
345
538
  """Adds a service (e.g. tensorboard viewer) to the experiment
346
539
 
347
540
  :param service: A service instance
348
- :return: The same service instance
541
+ :return: The same service instance (or existing service if already added)
349
542
  """
543
+ existing = self.services.get(service.id)
544
+ if existing is not None:
545
+ if existing is service:
546
+ # Same service instance added twice - just return it
547
+ logger.debug("Service %s already added, ignoring duplicate", service.id)
548
+ return service
549
+ else:
550
+ # Different service with same id - warn and replace
551
+ logger.warning(
552
+ "Replacing service %s (old id=%s, new id=%s)",
553
+ service.id,
554
+ id(existing),
555
+ id(service),
556
+ )
557
+
350
558
  self.services[service.id] = service
351
- for listener in self.scheduler.listeners:
352
- listener.service_add(service)
559
+
560
+ # Allow service to access experiment context
561
+ service.set_experiment(self)
562
+
563
+ # Register database listener for state changes
564
+ service.add_listener(self._db_listener)
565
+
566
+ # Register file listener for state changes (writes to services.json)
567
+ service.add_listener(self)
568
+
569
+ self.scheduler.notify_service_add(service)
570
+
571
+ # Write services.json file
572
+ self._write_services_json()
573
+
353
574
  return service
354
575
 
576
+ def service_state_changed(self, service):
577
+ """Called when a service state changes - update services.json"""
578
+ self._write_services_json()
579
+
355
580
  def save(self, obj: Any, name: str = "default"):
356
581
  """Serializes configurations.
357
582