experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (116)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +130 -5
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/refactor.py +249 -0
  7. experimaestro/click.py +0 -1
  8. experimaestro/commandline.py +19 -3
  9. experimaestro/connectors/__init__.py +20 -1
  10. experimaestro/connectors/local.py +12 -0
  11. experimaestro/core/arguments.py +182 -46
  12. experimaestro/core/identifier.py +107 -6
  13. experimaestro/core/objects/__init__.py +6 -0
  14. experimaestro/core/objects/config.py +542 -25
  15. experimaestro/core/objects/config_walk.py +20 -0
  16. experimaestro/core/serialization.py +91 -34
  17. experimaestro/core/subparameters.py +164 -0
  18. experimaestro/core/types.py +175 -38
  19. experimaestro/exceptions.py +26 -0
  20. experimaestro/experiments/cli.py +107 -25
  21. experimaestro/generators.py +50 -9
  22. experimaestro/huggingface.py +3 -1
  23. experimaestro/launcherfinder/parser.py +29 -0
  24. experimaestro/launchers/__init__.py +26 -1
  25. experimaestro/launchers/direct.py +12 -0
  26. experimaestro/launchers/slurm/base.py +154 -2
  27. experimaestro/mkdocs/metaloader.py +0 -1
  28. experimaestro/mypy.py +452 -7
  29. experimaestro/notifications.py +63 -13
  30. experimaestro/progress.py +0 -2
  31. experimaestro/rpyc.py +0 -1
  32. experimaestro/run.py +19 -6
  33. experimaestro/scheduler/base.py +489 -125
  34. experimaestro/scheduler/dependencies.py +43 -28
  35. experimaestro/scheduler/dynamic_outputs.py +259 -130
  36. experimaestro/scheduler/experiment.py +225 -30
  37. experimaestro/scheduler/interfaces.py +474 -0
  38. experimaestro/scheduler/jobs.py +216 -206
  39. experimaestro/scheduler/services.py +186 -12
  40. experimaestro/scheduler/state_db.py +388 -0
  41. experimaestro/scheduler/state_provider.py +2345 -0
  42. experimaestro/scheduler/state_sync.py +834 -0
  43. experimaestro/scheduler/workspace.py +52 -10
  44. experimaestro/scriptbuilder.py +7 -0
  45. experimaestro/server/__init__.py +147 -57
  46. experimaestro/server/data/index.css +0 -125
  47. experimaestro/server/data/index.css.map +1 -1
  48. experimaestro/server/data/index.js +194 -58
  49. experimaestro/server/data/index.js.map +1 -1
  50. experimaestro/settings.py +44 -5
  51. experimaestro/sphinx/__init__.py +3 -3
  52. experimaestro/taskglobals.py +20 -0
  53. experimaestro/tests/conftest.py +80 -0
  54. experimaestro/tests/core/test_generics.py +2 -2
  55. experimaestro/tests/identifier_stability.json +45 -0
  56. experimaestro/tests/launchers/bin/sacct +6 -2
  57. experimaestro/tests/launchers/bin/sbatch +4 -2
  58. experimaestro/tests/launchers/test_slurm.py +80 -0
  59. experimaestro/tests/tasks/test_dynamic.py +231 -0
  60. experimaestro/tests/test_cli_jobs.py +615 -0
  61. experimaestro/tests/test_deprecated.py +630 -0
  62. experimaestro/tests/test_environment.py +200 -0
  63. experimaestro/tests/test_file_progress_integration.py +1 -1
  64. experimaestro/tests/test_forward.py +3 -3
  65. experimaestro/tests/test_identifier.py +372 -41
  66. experimaestro/tests/test_identifier_stability.py +458 -0
  67. experimaestro/tests/test_instance.py +3 -3
  68. experimaestro/tests/test_multitoken.py +442 -0
  69. experimaestro/tests/test_mypy.py +433 -0
  70. experimaestro/tests/test_objects.py +312 -5
  71. experimaestro/tests/test_outputs.py +2 -2
  72. experimaestro/tests/test_param.py +8 -12
  73. experimaestro/tests/test_partial_paths.py +231 -0
  74. experimaestro/tests/test_progress.py +0 -48
  75. experimaestro/tests/test_resumable_task.py +480 -0
  76. experimaestro/tests/test_serializers.py +141 -1
  77. experimaestro/tests/test_state_db.py +434 -0
  78. experimaestro/tests/test_subparameters.py +160 -0
  79. experimaestro/tests/test_tags.py +136 -0
  80. experimaestro/tests/test_tasks.py +107 -121
  81. experimaestro/tests/test_token_locking.py +252 -0
  82. experimaestro/tests/test_tokens.py +17 -13
  83. experimaestro/tests/test_types.py +123 -1
  84. experimaestro/tests/test_workspace_triggers.py +158 -0
  85. experimaestro/tests/token_reschedule.py +4 -2
  86. experimaestro/tests/utils.py +2 -2
  87. experimaestro/tokens.py +154 -57
  88. experimaestro/tools/diff.py +1 -1
  89. experimaestro/tui/__init__.py +8 -0
  90. experimaestro/tui/app.py +2303 -0
  91. experimaestro/tui/app.tcss +353 -0
  92. experimaestro/tui/log_viewer.py +228 -0
  93. experimaestro/utils/__init__.py +23 -0
  94. experimaestro/utils/environment.py +148 -0
  95. experimaestro/utils/git.py +129 -0
  96. experimaestro/utils/resources.py +1 -1
  97. experimaestro/version.py +34 -0
  98. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +68 -38
  99. experimaestro-2.0.0b4.dist-info/RECORD +181 -0
  100. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
  101. experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
  102. experimaestro/compat.py +0 -6
  103. experimaestro/core/objects.pyi +0 -221
  104. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  105. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  106. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  107. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  108. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  109. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  110. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  111. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  112. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  113. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  114. experimaestro-2.0.0a8.dist-info/RECORD +0 -166
  115. experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
  116. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,9 @@
1
1
  import asyncio
2
+ import json
2
3
  import logging
3
4
  import os
4
5
  from pathlib import Path
6
+ import time
5
7
  from shutil import rmtree
6
8
  from typing import Any, Dict, Optional, TypeVar, Union
7
9
 
@@ -9,7 +11,7 @@ from experimaestro.core.objects import WatchedOutput
9
11
  from experimaestro.exceptions import HandledException
10
12
 
11
13
  from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
12
- from experimaestro.scheduler.jobs import Job, JobFailureStatus
14
+ from experimaestro.scheduler.jobs import Job
13
15
  from experimaestro.scheduler.services import Service
14
16
  from experimaestro.scheduler.workspace import RunMode, Workspace
15
17
  from experimaestro.settings import WorkspaceSettings, get_settings
@@ -24,15 +26,58 @@ class FailedExperiment(HandledException):
24
26
  pass
25
27
 
26
28
 
29
+ class DatabaseListener:
30
+ """Listener that updates job state in the database"""
31
+
32
+ def __init__(self, state_provider, experiment_id: str, run_id: str):
33
+ self.state_provider = state_provider
34
+ self.experiment_id = experiment_id
35
+ self.run_id = run_id
36
+
37
+ def job_submitted(self, job):
38
+ # Already handled in experiment.add_job()
39
+ pass
40
+
41
+ def job_state(self, job):
42
+ """Update job state in database"""
43
+ self.state_provider.update_job_state(job, self.experiment_id, self.run_id)
44
+
45
+ def service_add(self, service):
46
+ """Update service in database"""
47
+ self.state_provider.update_service(
48
+ service.id,
49
+ self.experiment_id,
50
+ self.run_id,
51
+ service.description(),
52
+ service.state.name,
53
+ state_dict=json.dumps(service.state_dict()),
54
+ )
55
+
56
+ def service_state_changed(self, service):
57
+ """Update service state in database (called by Service when state changes)"""
58
+ self.state_provider.update_service(
59
+ service.id,
60
+ self.experiment_id,
61
+ self.run_id,
62
+ service.description(),
63
+ service.state.name,
64
+ state_dict=json.dumps(service.state_dict()),
65
+ )
66
+
67
+
27
68
  class experiment:
28
- """Main experiment object
69
+ """Context manager for running experiments.
70
+
71
+ Creates a workspace, manages task submission, and optionally starts
72
+ a web server for monitoring.
73
+
74
+ Example::
29
75
 
30
- It is a context object, i.e. an experiment is run with
76
+ from experimaestro import experiment
31
77
 
32
- ```py
33
- with experiment(...) as xp:
34
- ...
35
- ```
78
+ with experiment("./workdir", "my-experiment", port=12345) as xp:
79
+ task = MyTask.C(param=42).submit()
80
+ result = task.wait()
36
81
  """
37
82
 
38
83
  #: Current experiment
@@ -57,6 +102,7 @@ class experiment:
57
102
  token: Optional[str] = None,
58
103
  run_mode: Optional[RunMode] = None,
59
104
  launcher=None,
105
+ register_signals: bool = True,
60
106
  ):
61
107
  """
62
108
  :param env: an environment -- or a working directory for a local
@@ -73,9 +119,11 @@ class experiment:
73
119
 
74
120
  :param run_mode: The run mode for the experiment (normal, generate run
75
121
  files, dry run)
122
+
123
+ :param register_signals: Whether to register signal handlers (default: True).
124
+ Set to False when running in a background thread.
76
125
  """
77
126
 
78
- from experimaestro.server import Server
79
127
  from experimaestro.scheduler import Listener, Scheduler
80
128
 
81
129
  settings = get_settings()
@@ -94,6 +142,7 @@ class experiment:
94
142
  self.old_experiment = None
95
143
  self.services: Dict[str, Service] = {}
96
144
  self._job_listener: Optional[Listener] = None
145
+ self._register_signals = register_signals
97
146
 
98
147
  # Get configuration settings
99
148
 
@@ -106,14 +155,14 @@ class experiment:
106
155
  if token is not None:
107
156
  settings.server.token = token
108
157
 
109
- # Create the scheduler
110
- self.scheduler = Scheduler.create(self, name)
111
- self.server = (
112
- Server(self.scheduler, settings.server)
113
- if (settings.server.port is not None and settings.server.port >= 0)
114
- and self.workspace.run_mode == RunMode.NORMAL
115
- else None
116
- )
158
+ # Use singleton scheduler
159
+ self.scheduler = Scheduler.instance()
160
+
161
+ # Determine if we need a server
162
+ self._needs_server = (
163
+ settings.server.port is not None and settings.server.port >= 0
164
+ ) and self.workspace.run_mode == RunMode.NORMAL
165
+ self._server_settings = settings.server if self._needs_server else None
117
166
 
118
167
  if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
119
168
  import faulthandler
@@ -137,6 +186,11 @@ class experiment:
137
186
  assert self.scheduler is not None, "No scheduler defined"
138
187
  return self.scheduler.loop
139
188
 
189
+ @property
190
+ def server(self):
191
+ """Access the server via the scheduler"""
192
+ return self.scheduler.server if self.scheduler else None
193
+
140
194
  @property
141
195
  def resultspath(self):
142
196
  """Return the directory in which results can be stored for this experiment"""
@@ -158,6 +212,79 @@ class experiment:
158
212
  """Return the directory in which results can be stored for this experiment"""
159
213
  return self.workdir / "jobs.bak"
160
214
 
215
+ @property
216
+ def jobs_jsonl_path(self):
217
+ """Return the path to the jobs.jsonl file for this experiment"""
218
+ return self.workdir / "jobs.jsonl"
219
+
220
+ @property
221
+ def services_json_path(self):
222
+ """Return the path to the services.json file for this experiment"""
223
+ return self.workdir / "services.json"
224
+
225
+ def _write_services_json(self):
226
+ """Write all services to services.json file"""
227
+ services_data = {}
228
+ for service_id, service in self.services.items():
229
+ # Get state_dict from service (includes __class__ for recreation)
230
+ service_state = service.state_dict()
231
+ # Add runtime state info
232
+ service_state.update(
233
+ {
234
+ "service_id": service_id,
235
+ "description": service.description(),
236
+ "state": service.state.name,
237
+ "url": getattr(service, "url", None),
238
+ "timestamp": time.time(),
239
+ }
240
+ )
241
+ services_data[service_id] = service_state
242
+
243
+ with self.services_json_path.open("w") as f:
244
+ json.dump(services_data, f, indent=2)
245
+
246
+ def add_job(self, job: "Job"):
247
+ """Register a job and its tags to jobs.jsonl file and database
248
+
249
+ Note: For NEW jobs, the unfinishedJobs counter is updated by
250
+ job.set_state() when the state transitions from UNSCHEDULED.
251
+ For jobs already running, we increment here since no state
252
+ transition will occur.
253
+ """
254
+ from experimaestro.scheduler.interfaces import JobState
255
+
256
+ if self in job.experiments:
257
+ # Do not double register
258
+ return
259
+
260
+ # Track which experiments this job belongs to
261
+ job.experiments.append(self)
262
+
263
+ # If job is already being tracked (not UNSCHEDULED and not finished),
264
+ # increment unfinishedJobs since no state transition will trigger it
265
+ if job.state != JobState.UNSCHEDULED and not job.state.finished():
266
+ self.unfinishedJobs += 1
267
+ logging.debug(
268
+ "Job %s already running, unfinished jobs for %s: %d",
269
+ job.identifier[:8],
270
+ self.workdir.name,
271
+ self.unfinishedJobs,
272
+ )
273
+
274
+ record = {
275
+ "job_id": job.identifier,
276
+ "task_id": str(job.type.identifier),
277
+ "tags": dict(job.tags.items()) if job.tags else {},
278
+ "timestamp": time.time(),
279
+ }
280
+
281
+ with self.jobs_jsonl_path.open("a") as f:
282
+ f.write(json.dumps(record) + "\n")
283
+
284
+ # Also register in database for TUI/monitoring
285
+ experiment_id = self.workdir.name
286
+ self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
287
+
161
288
  def stop(self):
162
289
  """Stop the experiment as soon as possible"""
163
290
 
@@ -196,9 +323,24 @@ class experiment:
196
323
 
197
324
  if self.failedJobs:
198
325
  # Show some more information
326
+ from experimaestro.scheduler.jobs import (
327
+ JobStateError,
328
+ JobFailureStatus,
329
+ )
330
+
199
331
  count = 0
200
332
  for job in self.failedJobs.values():
201
- if job.failure_status != JobFailureStatus.DEPENDENCY:
333
+ # Skip dependency failures - only log direct failures
334
+ if isinstance(job.state, JobStateError):
335
+ if job.state.failure_reason != JobFailureStatus.DEPENDENCY:
336
+ count += 1
337
+ logger.error(
338
+ "Job %s failed, check the log file %s",
339
+ job.relpath,
340
+ job.stderr,
341
+ )
342
+ else:
343
+ # Should not happen, but count it anyway
202
344
  count += 1
203
345
  logger.error(
204
346
  "Job %s failed, check the log file %s",
@@ -224,12 +366,18 @@ class experiment:
224
366
 
225
367
  def __enter__(self):
226
368
  from .dynamic_outputs import TaskOutputsWorker
369
+ from experimaestro.utils.environment import save_environment_info
227
370
 
228
371
  if self.workspace.run_mode != RunMode.DRY_RUN:
229
372
  logger.info("Locking experiment %s", self.xplockpath)
230
373
  self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
231
374
  logger.info("Experiment locked")
232
375
 
376
+ # Capture and save environment info (git info for editable packages + all package versions)
377
+ if self.workspace.run_mode == RunMode.NORMAL:
378
+ env_info_path = self.workdir / "environment.json"
379
+ save_environment_info(env_info_path)
380
+
233
381
  # Move old jobs into "jobs.bak"
234
382
  if self.workspace.run_mode == RunMode.NORMAL:
235
383
  self.jobsbakpath.mkdir(exist_ok=True)
@@ -244,12 +392,36 @@ class experiment:
244
392
  target.parent.mkdir(parents=True, exist_ok=True)
245
393
  p.rename(target)
246
394
 
247
- if self.server:
248
- self.server.start()
395
+ # Register experiment with scheduler
396
+ self.scheduler.register_experiment(self)
397
+
398
+ # Start server via scheduler if needed
399
+ if self._needs_server:
400
+ self.scheduler.start_server(self._server_settings, workspace=self.workspace)
249
401
 
250
402
  self.workspace.__enter__()
251
403
  (self.workspace.path / ".__experimaestro__").touch()
252
404
 
405
+ # Initialize workspace state provider (singleton per workspace path)
406
+ from .state_provider import WorkspaceStateProvider
407
+
408
+ self.state_provider = WorkspaceStateProvider.get_instance(
409
+ self.workspace.path,
410
+ read_only=False,
411
+ sync_on_start=False, # Experiments don't sync on start
412
+ )
413
+
414
+ # Register experiment in database and create a run
415
+ experiment_id = self.workdir.name
416
+ self.state_provider.ensure_experiment(experiment_id)
417
+ self.run_id = self.state_provider.create_run(experiment_id)
418
+
419
+ # Add database listener to update job state in database
420
+ self._db_listener = DatabaseListener(
421
+ self.state_provider, experiment_id, self.run_id
422
+ )
423
+ self.scheduler.addlistener(self._db_listener)
424
+
253
425
  # Number of unfinished jobs
254
426
  self.unfinishedJobs = 0
255
427
  self.taskOutputQueueSize = 0
@@ -260,11 +432,12 @@ class experiment:
260
432
  # Exit mode when catching signals
261
433
  self.exitMode = False
262
434
 
263
- self.scheduler.start_scheduler()
435
+ # Note: scheduler is already running as singleton
264
436
  self.taskOutputsWorker = TaskOutputsWorker(self)
265
437
  self.taskOutputsWorker.start()
266
438
 
267
- SIGNAL_HANDLER.add(self)
439
+ if self._register_signals:
440
+ SIGNAL_HANDLER.add(self)
268
441
 
269
442
  self.old_experiment = experiment.CURRENT
270
443
  experiment.CURRENT = self
@@ -289,16 +462,27 @@ class experiment:
289
462
  else:
290
463
  self.wait()
291
464
  finally:
292
- SIGNAL_HANDLER.remove(self)
465
+ if self._register_signals:
466
+ SIGNAL_HANDLER.remove(self)
293
467
 
294
468
  # Stop services
295
469
  for service in self.services.values():
296
470
  logger.info("Closing service %s", service.description())
297
471
  service.stop()
298
472
 
299
- if self.scheduler is not None:
300
- logger.info("Stopping scheduler event loop")
301
- self.scheduler.loop.stop()
473
+ # Unregister experiment from scheduler
474
+ self.scheduler.unregister_experiment(self)
475
+
476
+ # Remove database listener
477
+ self.scheduler.removelistener(self._db_listener)
478
+
479
+ # Mark run as completed in database
480
+ experiment_id = self.workdir.name
481
+ status = "failed" if exc_type else "completed"
482
+ self.state_provider.complete_run(experiment_id, self.run_id, status)
483
+
484
+ # Note: Don't stop scheduler - it's shared!
485
+ # Note: Don't stop server - it runs in daemon mode until program exit
302
486
 
303
487
  if self.taskOutputsWorker is not None:
304
488
  logger.info("Stopping tasks outputs worker")
@@ -310,9 +494,6 @@ class experiment:
310
494
 
311
495
  # Put back old experiment as current one
312
496
  experiment.CURRENT = self.old_experiment
313
- if self.server:
314
- logger.info("Stopping web server")
315
- self.server.stop()
316
497
 
317
498
  if self.workspace.run_mode == RunMode.NORMAL:
318
499
  # Write the state
@@ -348,10 +529,24 @@ class experiment:
348
529
  :return: The same service instance
349
530
  """
350
531
  self.services[service.id] = service
351
- for listener in self.scheduler.listeners:
352
- listener.service_add(service)
532
+
533
+ # Register database listener for state changes
534
+ service.add_listener(self._db_listener)
535
+
536
+ # Register file listener for state changes (writes to services.json)
537
+ service.add_listener(self)
538
+
539
+ self.scheduler.notify_service_add(service)
540
+
541
+ # Write services.json file
542
+ self._write_services_json()
543
+
353
544
  return service
354
545
 
546
+ def service_state_changed(self, service):
547
+ """Called when a service state changes - update services.json"""
548
+ self._write_services_json()
549
+
355
550
  def save(self, obj: Any, name: str = "default"):
356
551
  """Serializes configurations.
357
552