PyPI - experimaestro - Versions diffs - 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl - Mend

experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of experimaestro might be problematic. Click here for more details.

Files changed (133)

experimaestro/__init__.py +10 -11
experimaestro/annotations.py +167 -206
experimaestro/cli/__init__.py +140 -16
experimaestro/cli/filter.py +42 -74
experimaestro/cli/jobs.py +157 -106
experimaestro/cli/progress.py +269 -0
experimaestro/cli/refactor.py +249 -0
experimaestro/click.py +0 -1
experimaestro/commandline.py +19 -3
experimaestro/connectors/__init__.py +22 -3
experimaestro/connectors/local.py +12 -0
experimaestro/core/arguments.py +192 -37
experimaestro/core/identifier.py +127 -12
experimaestro/core/objects/__init__.py +6 -0
experimaestro/core/objects/config.py +702 -285
experimaestro/core/objects/config_walk.py +24 -6
experimaestro/core/serialization.py +91 -34
experimaestro/core/serializers.py +1 -8
experimaestro/core/subparameters.py +164 -0
experimaestro/core/types.py +198 -83
experimaestro/exceptions.py +26 -0
experimaestro/experiments/cli.py +107 -25
experimaestro/generators.py +50 -9
experimaestro/huggingface.py +3 -1
experimaestro/launcherfinder/parser.py +29 -0
experimaestro/launcherfinder/registry.py +3 -3
experimaestro/launchers/__init__.py +26 -1
experimaestro/launchers/direct.py +12 -0
experimaestro/launchers/slurm/base.py +154 -2
experimaestro/mkdocs/base.py +6 -8
experimaestro/mkdocs/metaloader.py +0 -1
experimaestro/mypy.py +452 -7
experimaestro/notifications.py +75 -16
experimaestro/progress.py +404 -0
experimaestro/rpyc.py +0 -1
experimaestro/run.py +19 -6
experimaestro/scheduler/__init__.py +18 -1
experimaestro/scheduler/base.py +504 -959
experimaestro/scheduler/dependencies.py +43 -28
experimaestro/scheduler/dynamic_outputs.py +259 -130
experimaestro/scheduler/experiment.py +582 -0
experimaestro/scheduler/interfaces.py +474 -0
experimaestro/scheduler/jobs.py +485 -0
experimaestro/scheduler/services.py +186 -12
experimaestro/scheduler/signal_handler.py +32 -0
experimaestro/scheduler/state.py +1 -1
experimaestro/scheduler/state_db.py +388 -0
experimaestro/scheduler/state_provider.py +2345 -0
experimaestro/scheduler/state_sync.py +834 -0
experimaestro/scheduler/workspace.py +52 -10
experimaestro/scriptbuilder.py +7 -0
experimaestro/server/__init__.py +153 -32
experimaestro/server/data/index.css +0 -125
experimaestro/server/data/index.css.map +1 -1
experimaestro/server/data/index.js +194 -58
experimaestro/server/data/index.js.map +1 -1
experimaestro/settings.py +47 -6
experimaestro/sphinx/__init__.py +3 -3
experimaestro/taskglobals.py +20 -0
experimaestro/tests/conftest.py +80 -0
experimaestro/tests/core/test_generics.py +2 -2
experimaestro/tests/identifier_stability.json +45 -0
experimaestro/tests/launchers/bin/sacct +6 -2
experimaestro/tests/launchers/bin/sbatch +4 -2
experimaestro/tests/launchers/common.py +2 -2
experimaestro/tests/launchers/test_slurm.py +80 -0
experimaestro/tests/restart.py +1 -1
experimaestro/tests/tasks/all.py +7 -0
experimaestro/tests/tasks/test_dynamic.py +231 -0
experimaestro/tests/test_checkers.py +2 -2
experimaestro/tests/test_cli_jobs.py +615 -0
experimaestro/tests/test_dependencies.py +11 -17
experimaestro/tests/test_deprecated.py +630 -0
experimaestro/tests/test_environment.py +200 -0
experimaestro/tests/test_experiment.py +3 -3
experimaestro/tests/test_file_progress.py +425 -0
experimaestro/tests/test_file_progress_integration.py +477 -0
experimaestro/tests/test_forward.py +3 -3
experimaestro/tests/test_generators.py +93 -0
experimaestro/tests/test_identifier.py +520 -169
experimaestro/tests/test_identifier_stability.py +458 -0
experimaestro/tests/test_instance.py +16 -21
experimaestro/tests/test_multitoken.py +442 -0
experimaestro/tests/test_mypy.py +433 -0
experimaestro/tests/test_objects.py +314 -30
experimaestro/tests/test_outputs.py +8 -8
experimaestro/tests/test_param.py +22 -26
experimaestro/tests/test_partial_paths.py +231 -0
experimaestro/tests/test_progress.py +2 -50
experimaestro/tests/test_resumable_task.py +480 -0
experimaestro/tests/test_serializers.py +141 -60
experimaestro/tests/test_state_db.py +434 -0
experimaestro/tests/test_subparameters.py +160 -0
experimaestro/tests/test_tags.py +151 -15
experimaestro/tests/test_tasks.py +137 -160
experimaestro/tests/test_token_locking.py +252 -0
experimaestro/tests/test_tokens.py +25 -19
experimaestro/tests/test_types.py +133 -11
experimaestro/tests/test_validation.py +19 -19
experimaestro/tests/test_workspace_triggers.py +158 -0
experimaestro/tests/token_reschedule.py +5 -3
experimaestro/tests/utils.py +2 -2
experimaestro/tokens.py +154 -57
experimaestro/tools/diff.py +8 -1
experimaestro/tui/__init__.py +8 -0
experimaestro/tui/app.py +2303 -0
experimaestro/tui/app.tcss +353 -0
experimaestro/tui/log_viewer.py +228 -0
experimaestro/typingutils.py +11 -2
experimaestro/utils/__init__.py +23 -0
experimaestro/utils/environment.py +148 -0
experimaestro/utils/git.py +129 -0
experimaestro/utils/resources.py +1 -1
experimaestro/version.py +34 -0
{experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
experimaestro-2.0.0b4.dist-info/RECORD +181 -0
{experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
experimaestro/compat.py +0 -6
experimaestro/core/objects.pyi +0 -225
experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
experimaestro-1.11.1.dist-info/RECORD +0 -158
experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
{experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0

experimaestro/scheduler/base.py CHANGED Viewed

@@ -1,503 +1,210 @@
-from collections import ChainMap
-from functools import cached_property
-import itertools
-import logging
-import os
-from pathlib import Path
-from shutil import rmtree
 import threading
 import time
 from typing import (
-    Any,
-    Iterator,
-    List,
     Optional,
     Set,
-    TypeVar,
-    Union,
+    ClassVar,
     TYPE_CHECKING,
 )
-import enum
-import signal
 import asyncio
-from experimaestro.exceptions import HandledException
-from experimaestro.notifications import LevelInformation, Reporter
 from typing import Dict
+from experimaestro.scheduler import experiment
+from experimaestro.scheduler.jobs import Job, JobState, JobError
 from experimaestro.scheduler.services import Service
-from experimaestro.settings import WorkspaceSettings, get_settings
-from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
 from experimaestro.utils import logger
-from experimaestro.locking import Locks, LockError, Lock
 from experimaestro.utils.asyncio import asyncThreadcheck
-from .workspace import RunMode, Workspace
-from .dependencies import Dependency, DependencyStatus, Resource
 import concurrent.futures
 if TYPE_CHECKING:
-    from experimaestro.connectors import Process
-    from experimaestro.launchers import Launcher
-class FailedExperiment(HandledException):
-    """Raised when an experiment failed"""
-    pass
-class JobState(enum.Enum):
-    # Job is not yet scheduled
-    UNSCHEDULED = 0
-    # Job is waiting for dependencies to be done
-    WAITING = 1
-    # Job is ready to run
-    READY = 2
-    # Job is scheduled (e.g. slurm)
-    SCHEDULED = 3
-    # Job is running
-    RUNNING = 4
-    # Job is done (finished)
-    DONE = 5
-    # Job failed (finished)
-    ERROR = 6
-    def notstarted(self):
-        return self.value <= JobState.READY.value
-    def running(self):
-        return (
-            self.value == JobState.RUNNING.value
-            or self.value == JobState.SCHEDULED.value
-        )
-    def finished(self):
-        return self.value >= JobState.DONE.value
+    from experimaestro.server import Server
+    from experimaestro.settings import ServerSettings
+    from experimaestro.scheduler.workspace import Workspace
-class JobFailureStatus(enum.Enum):
-    #: Job failed
-    DEPENDENCY = 0
-    #: Job dependency failed
-    FAILED = 1
-    #: Memory
-    MEMORY = 2
-class JobLock(Lock):
-    def __init__(self, job):
-        super().__init__()
-        self.job = job
+class Listener:
+    def job_submitted(self, job):
+        pass
-    def _acquire(self):
-        return self.job.state == JobState.DONE
+    def job_state(self, job):
+        pass
-    def _release(self):
-        return False
+    def service_add(self, service: Service):
+        """Notify when a new service is added"""
+        pass
-class JobDependency(Dependency):
-    def __init__(self, job):
-        super().__init__(job)
+class Scheduler(threading.Thread):
+    """A job scheduler (singleton)
-    def status(self) -> DependencyStatus:
-        if self.origin.state == JobState.DONE:
-            return DependencyStatus.OK
-        elif self.origin.state == JobState.ERROR:
-            return DependencyStatus.FAIL
-        return DependencyStatus.WAIT
+    The scheduler is based on asyncio for easy concurrency handling.
+    This is a singleton - only one scheduler instance exists per process.
+    """
-    def lock(self):
-        return JobLock(self.origin)
+    _instance: ClassVar[Optional["Scheduler"]] = None
+    _lock: ClassVar[threading.Lock] = threading.Lock()
+    def __init__(self, name: str = "Global"):
+        super().__init__(name=f"Scheduler ({name})", daemon=True)
+        self._ready = threading.Event()
-class Job(Resource):
-    """A job is a resource that is produced by the execution of some code"""
+        # Name of the scheduler
+        self.name = name
-    # Set by the scheduler
-    _readyEvent: Optional[asyncio.Event]
-    _future: Optional["concurrent.futures.Future"]
+        # Track experiments (simple dict for now)
+        self.experiments: Dict[str, "experiment"] = {}
-    def __init__(
-        self,
-        config: Config,
-        *,
-        workspace: Workspace = None,
-        launcher: "Launcher" = None,
-        run_mode: RunMode = RunMode.NORMAL,
-    ):
-        super().__init__()
+        # Exit mode activated
+        self.exitmode = False
-        self.workspace = workspace or Workspace.CURRENT
-        self.launcher = launcher or self.workspace.launcher if self.workspace else None
+        # List of all jobs
+        self.jobs: Dict[str, "Job"] = {}
-        if run_mode == RunMode.NORMAL:
-            assert self.workspace is not None, "No experiment has been defined"
-            assert self.launcher is not None, (
-                "No launcher, and no default defined for the workspace %s" % workspace
-            )
+        # List of jobs
+        self.waitingjobs: Set[Job] = set()
-        self.type = config.__xpmtype__
-        self.name = str(self.type.identifier).rsplit(".", 1)[-1]
+        # Listeners with thread-safe access
+        self._listeners: Set[Listener] = set()
+        self._listeners_lock = threading.Lock()
-        self.scheduler: Optional["Scheduler"] = None
-        self.config = config
-        self.state: JobState = JobState.UNSCHEDULED
+        # Notification thread pool (single worker to serialize notifications)
+        self._notification_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=1, thread_name_prefix="NotificationWorker"
+        )
-        #: If a job has failed, indicates the failure status
-        self.failure_status: JobFailureStatus = None
+        # Server (managed by scheduler)
+        self.server: Optional["Server"] = None
-        # Dependencies
-        self.dependencies: Set[Dependency] = set()  # as target
+    @staticmethod
+    def has_instance() -> bool:
+        """Check if a scheduler instance exists without creating one"""
+        return Scheduler._instance is not None
-        # Watched outputs
-        self.watched_outputs = {}
-        for watched in config.__xpm__.watched_outputs:
-            self.watch_output(watched)
+    @staticmethod
+    def instance() -> "Scheduler":
+        """Get or create the global scheduler instance"""
+        if Scheduler._instance is None:
+            with Scheduler._lock:
+                if Scheduler._instance is None:
+                    Scheduler._instance = Scheduler._create()
+        return Scheduler._instance
-        # Process
-        self._process = None
-        self.unsatisfied = 0
+    @staticmethod
+    def _create(name: str = "Global"):
+        """Internal method to create and start scheduler"""
+        instance = Scheduler(name)
+        instance.start()
+        instance._ready.wait()
+        return instance
-        # Meta-information
-        self.starttime: Optional[float] = None
-        self.submittime: Optional[float] = None
-        self.endtime: Optional[float] = None
-        self._progress: List[LevelInformation] = []
-        self.tags = config.tags()
+    @staticmethod
+    def create(xp: "experiment" = None, name: str = "Global"):
+        """Create or get the scheduler instance
-    def watch_output(self, watched: "WatchedOutput"):
-        """Monitor task outputs
+        Args:
+            xp: (Deprecated) Experiment reference, ignored
+            name: Name for the scheduler (only used on first creation)
-        :param watched: A description of the watched output
+        Returns:
+            The global scheduler instance
         """
-        self.scheduler.xp.watch_output(watched)
-    def task_output_update(self, subpath: Path):
-        """Notification of an updated task output"""
-        if watcher := self.watched_outputs.get(subpath, None):
-            watcher.update()
-    def done_handler(self):
-        """The task has been completed"""
-        for watcher in self.watched_outputs.values():
-            watcher.update()
-    def __str__(self):
-        return "Job[{}]".format(self.identifier)
+        return Scheduler.instance()
-    def wait(self) -> JobState:
-        assert self._future, "Cannot wait a not submitted job"
-        return self._future.result()
+    def register_experiment(self, xp: "experiment"):
+        """Register an experiment with the scheduler"""
+        # Use experiment name as key for now
+        key = xp.workdir.name
+        self.experiments[key] = xp
-    @cached_property
-    def python_path(self) -> Iterator[str]:
-        """Returns an iterator over python path"""
-        return itertools.chain(self.workspace.python_path)
+        logger.debug("Registered experiment %s with scheduler", key)
-    @cached_property
-    def environ(self):
-        """Returns the job environment
+    def unregister_experiment(self, xp: "experiment"):
+        """Unregister an experiment from the scheduler"""
+        key = xp.workdir.name
+        if key in self.experiments:
+            del self.experiments[key]
+            logger.debug("Unregistered experiment %s from scheduler", key)
-        It is made of (by order of priority):
-        1. The job environment
-        1. The launcher environment
-        1. The workspace environment
-        """
-        return ChainMap(
-            {},
-            self.launcher.environ if self.launcher else {},
-            self.workspace.env if self.workspace else {},
-        )
+    def start_server(
+        self, settings: "ServerSettings" = None, workspace: "Workspace" = None
+    ):
+        """Start the notification server (if not already running)
-    @property
-    def progress(self):
-        return self._progress
-    def set_progress(self, level: int, value: float, desc: Optional[str]):
-        if value < 0:
-            logger.warning(f"Progress value out of bounds ({value})")
-            value = 0
-        elif value > 1:
-            logger.warning(f"Progress value out of bounds ({value})")
-            value = 1
-        # Adjust the length of the array
-        self._progress = self._progress[: (level + 1)]
-        while len(self._progress) <= level:
-            self._progress.append(LevelInformation(len(self._progress), None, 0.0))
-        if desc:
-            self._progress[-1].desc = desc
-        self._progress[-1].progress = value
-        for listener in self.scheduler.listeners:
-            listener.job_state(self)
-    def add_notification_server(self, server):
-        """Adds a notification server"""
-        key, baseurl = server.getNotificationSpec()
-        dirpath = self.path / Reporter.NOTIFICATION_FOLDER
-        dirpath.mkdir(exist_ok=True)
-        (dirpath / key).write_text(f"{baseurl}/{self.identifier}")
-    @property
-    def ready(self):
-        return self.state == JobState.READY
-    @property
-    def jobpath(self) -> Path:
-        """Deprecated, use `path`"""
-        return self.workspace.jobspath / self.relpath
-    @property
-    def path(self) -> Path:
-        return self.workspace.jobspath / self.relpath
-    @property
-    def experimaestro_path(self) -> Path:
-        return (self.path / ".experimaestro").resolve()
-    @cached_property
-    def task_outputs_path(self) -> Path:
-        return self.experimaestro_path / "task-outputs.jsonl"
-    @property
-    def relpath(self):
-        identifier = self.config.__xpm__.identifier
-        base = Path(str(self.type.identifier))
-        return base / identifier.all.hex()
-    @property
-    def relmainpath(self):
-        identifier = self.config.__xpm__.identifier
-        base = Path(str(self.type.identifier))
-        return base / identifier.main.hex()
-    @property
-    def hashidentifier(self):
-        return self.config.__xpm__.identifier
-    @property
-    def identifier(self):
-        return self.config.__xpm__.identifier.all.hex()
-    def prepare(self, overwrite=False):
-        """Prepare all files before starting a task
-        :param overwrite: if True, overwrite files even if the task has been run
+        Args:
+            settings: Server settings
+            workspace: Workspace instance (required to get workspace path)
         """
-        pass
-    async def aio_run(self):
-        """Actually run the code"""
-        raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")
-    async def aio_process(self) -> Optional["Process"]:
-        """Returns the process if it exists"""
-        raise NotImplementedError("Not implemented")
-    @property
-    def pidpath(self):
-        """This file contains the file PID"""
-        return self.jobpath / ("%s.pid" % self.name)
-    @property
-    def lockpath(self):
-        """This file is used as a lock for running the job"""
-        return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)
-    @property
-    def donepath(self) -> Path:
-        """When a job has been successful, this file is written"""
-        return self.jobpath / ("%s.done" % self.name)
-    @property
-    def failedpath(self):
-        """When a job has been unsuccessful, this file is written with an error
-        code inside"""
-        return self.jobpath / ("%s.failed" % self.name)
-    @property
-    def stdout(self) -> Path:
-        return self.jobpath / ("%s.out" % self.name)
-    @property
-    def stderr(self) -> Path:
-        return self.jobpath / ("%s.err" % self.name)
-    @property
-    def basepath(self) -> Path:
-        return self.jobpath / self.name
-    def dependencychanged(self, dependency, oldstatus, status):
-        """Called when a dependency has changed"""
-        def value(s):
-            return 1 if s == DependencyStatus.OK else 0
+        if self.server is None:
+            from experimaestro.server import Server
+            from experimaestro.scheduler.state_provider import WorkspaceStateProvider
-        self.unsatisfied -= value(status) - value(oldstatus)
+            if workspace is None:
+                raise ValueError("workspace parameter is required to start server")
-        logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)
-        if status == DependencyStatus.FAIL:
-            # Job completed
-            if not self.state.finished():
-                self.state = JobState.ERROR
-                self.failure_status = JobFailureStatus.DEPENDENCY
-                self._readyEvent.set()
-        if self.unsatisfied == 0:
-            logger.info("Job %s is ready to run", self)
-            # We are ready
-            self.state = JobState.READY
-            self._readyEvent.set()
-    def finalState(self) -> "concurrent.futures.Future[JobState]":
-        assert self._future is not None
-        return self._future
-class JobContext(ConfigWalkContext):
-    def __init__(self, job: Job):
-        super().__init__()
-        self.job = job
-    @property
-    def name(self):
-        return self.job.name
-    @property
-    def path(self):
-        return self.job.path
-    @property
-    def task(self):
-        return self.job.config
-class Listener:
-    def job_submitted(self, job):
-        pass
-    def job_state(self, job):
-        pass
-    def service_add(self, service: Service):
-        """Notify when a new service is added"""
-        pass
-class JobError(Exception):
-    def __init__(self, code):
-        super().__init__(f"Job exited with code {code}")
-class SignalHandler:
-    def __init__(self):
-        self.experiments: Set["experiment"] = set()
-        self.original_sigint_handler = None
-    def add(self, xp: "experiment"):
-        if not self.experiments:
-            self.original_sigint_handler = signal.getsignal(signal.SIGINT)
-            signal.signal(signal.SIGINT, self)
-        self.experiments.add(xp)
-    def remove(self, xp):
-        self.experiments.remove(xp)
-        if not self.experiments:
-            signal.signal(signal.SIGINT, self.original_sigint_handler)
-    def __call__(self, signum, frame):
-        """SIGINT signal handler"""
-        logger.warning("Signal received")
-        for xp in self.experiments:
-            xp.stop()
-SIGNAL_HANDLER = SignalHandler()
-class SchedulerCentral(threading.Thread):
-    loop: asyncio.AbstractEventLoop
-    """The event loop thread used by the scheduler"""
+            # Get the workspace state provider singleton
+            state_provider = WorkspaceStateProvider.get_instance(
+                workspace.path, read_only=False, sync_on_start=False
+            )
-    def __init__(self, name: str):
-        # Daemon thread so it is non blocking
-        super().__init__(name=f"Scheduler EL ({name})", daemon=True)
+            self.server = Server.instance(settings, state_provider)
+            self.server.start()
+            logger.info("Server started by scheduler")
+        else:
+            logger.debug("Server already running")
-        self._ready = threading.Event()
+    def stop_server(self):
+        """Stop the notification server"""
+        if self.server is not None:
+            self.server.stop()
+            logger.info("Server stopped by scheduler")
     def run(self):
+        """Run the event loop forever"""
         logger.debug("Starting event loop thread")
+        # Ported from SchedulerCentral
         self.loop = asyncio.new_event_loop()
         asyncio.set_event_loop(self.loop)
         # Set loop-dependent variables
         self.exitCondition = asyncio.Condition()
         self.dependencyLock = asyncio.Lock()
-        # Start the event loop
+        # Note: State provider removed - now managed at workspace level
+        # Each experiment has its own workspace with database
         self._ready.set()
         self.loop.run_forever()
-    @staticmethod
-    def create(name: str):
-        instance = SchedulerCentral(name)
-        instance.start()
-        instance._ready.wait()
-        return instance
-class Scheduler:
-    """A job scheduler
-    The scheduler is based on asyncio for easy concurrency handling
-    """
-    def __init__(self, xp: "experiment", name: str):
-        # Name of the experiment
-        self.name = name
-        self.xp = xp
-        # Exit mode activated
-        self.exitmode = False
-        # List of all jobs
-        self.jobs: Dict[str, "Job"] = {}
-        # List of jobs
-        self.waitingjobs: Set[Job] = set()
-        # Listeners
-        self.listeners: Set[Listener] = set()
-    @property
-    def loop(self):
-        return self.xp.loop
+    def start_scheduler(self):
+        """Start the scheduler event loop in a thread"""
+        if not self.is_alive():
+            self.start()
+            self._ready.wait()
+        else:
+            logger.warning("Scheduler already started")
     def addlistener(self, listener: Listener):
-        self.listeners.add(listener)
+        with self._listeners_lock:
+            self._listeners.add(listener)
     def removelistener(self, listener: Listener):
-        self.listeners.remove(listener)
+        with self._listeners_lock:
+            self._listeners.discard(listener)
+    def clear_listeners(self):
+        """Clear all listeners (for testing purposes)"""
+        with self._listeners_lock:
+            self._listeners.clear()
     def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
+        # Check if the job belongs to this scheduler
+        if job.identifier not in self.jobs:
+            # If job is not in this scheduler, return its current state directly
+            future = concurrent.futures.Future()
+            future.set_result(job.state)
+            return future
         return asyncio.run_coroutine_threadsafe(self.aio_getjobstate(job), self.loop)
     async def aio_getjobstate(self, job: Job):
@@ -505,17 +212,25 @@ class Scheduler:
     def submit(self, job: Job) -> Optional[Job]:
         # Wait for the future containing the submitted job
-        logger.debug("Registering the job %s within the scheduler", job)
+        logger.debug("Submit job %s to the scheduler", job)
         otherFuture = asyncio.run_coroutine_threadsafe(
             self.aio_registerJob(job), self.loop
         )
         other = otherFuture.result()
         logger.debug("Job already submitted" if other else "First submission")
-        if other:
-            return other
+        # Only returns if job was already submitted and doesn't need reprocessing
+        if other is not None:
+            # If state is WAITING, it was just reset for resubmission and needs processing
+            # If state is RUNNING or finished (DONE), no need to reprocess
+            if other.state != JobState.WAITING:
+                return other
+            # Use 'other' for resubmission since it has the correct experiments list
+            job = other
         job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
-        return None
+        return other
     def prepare(self, job: Job):
         """Prepares the job for running"""
@@ -530,33 +245,99 @@ class Scheduler:
         if self.exitmode:
             logger.warning("Exit mode: not submitting")
+            return
-        elif job.identifier in self.jobs:
+        # Job was already submitted
+        if job.identifier in self.jobs:
             other = self.jobs[job.identifier]
             assert job.type == other.type
-            if other.state == JobState.ERROR:
+            # Add current experiment to the existing job's experiments list
+            xp = experiment.current()
+            xp.add_job(other)
+            # Copy watched outputs from new job to existing job
+            # This ensures new callbacks are registered even for resubmitted jobs
+            other.watched_outputs.extend(job.watched_outputs)
+            if other.state.is_error():
                 logger.info("Re-submitting job")
+                # Clean up old process info so it will be re-started
+                other._process = None
+                if other.pidpath.is_file():
+                    other.pidpath.unlink()
+                # Use set_state to handle experiment statistics updates
+                other.set_state(JobState.WAITING)
+                self.notify_job_state(other)  # Notify listeners of re-submit
             else:
                 logger.warning("Job %s already submitted", job.identifier)
-                return other
-        else:
-            # Register this job
-            self.xp.unfinishedJobs += 1
-            self.jobs[job.identifier] = job
+            # Returns the previous job
+            return other
+        # Register this job
+        xp = experiment.current()
+        self.jobs[job.identifier] = job
+        # Set submittime now so that add_job can record it in the database
+        # (aio_submit may update this later for re-submitted jobs)
+        job.submittime = time.time()
+        xp.add_job(job)
+        # Set up dependencies
+        for dependency in job.dependencies:
+            dependency.target = job
+            dependency.origin.dependents.add(dependency)
         return None
-    async def aio_submit(self, job: Job) -> JobState:  # noqa: C901
+    def _notify_listeners(self, notification_func, job: Job):
+        """Execute notification in thread pool with error isolation.
+        This runs notifications in a dedicated thread pool to avoid blocking
+        the scheduler and to isolate errors from affecting other listeners.
+        """
+        def _do_notify():
+            # Get a snapshot of listeners with the lock
+            with self._listeners_lock:
+                listeners_snapshot = list(self._listeners)
+            for listener in listeners_snapshot:
+                try:
+                    notification_func(listener, job)
+                except Exception:
+                    logger.exception("Got an error with listener %s", listener)
+        self._notification_executor.submit(_do_notify)
+    def notify_job_submitted(self, job: Job):
+        """Notify the listeners that a job has been submitted"""
+        self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
+    def notify_job_state(self, job: Job):
+        """Notify the listeners that a job has changed state"""
+        self._notify_listeners(lambda lst, j: lst.job_state(j), job)
+    def notify_service_add(self, service: Service):
+        """Notify the listeners that a service has been added"""
+        self._notify_listeners(lambda lst, s: lst.service_add(s), service)
+    async def aio_submit(self, job: Job) -> JobState:
         """Main scheduler function: submit a job, run it (if needed), and returns
         the status code
         """
+        from experimaestro.scheduler.jobs import JobStateError, JobFailureStatus
         logger.info("Submitting job %s", job)
-        job._readyEvent = asyncio.Event()
         job.submittime = time.time()
         job.scheduler = self
         self.waitingjobs.add(job)
+        # Register watched outputs now that the job has a scheduler
+        job.register_watched_outputs()
+        # Note: Job metadata will be written after directory is created in aio_start
         # Check that we don't have a completed job in
         # alternate directories
         for jobspath in experiment.current().alt_jobspaths:
@@ -570,560 +351,324 @@ class Scheduler:
             path.unlink()
         path.symlink_to(job.path)
-        job.state = JobState.WAITING
-        for listener in self.listeners:
-            try:
-                listener.job_submitted(job)
-            except Exception:
-                logger.exception("Got an error with listener %s", listener)
-        # Add dependencies, and add to blocking resources
-        if job.dependencies:
-            job.unsatisfied = len(job.dependencies)
-            for dependency in job.dependencies:
-                dependency.target = job
-                dependency.loop = self.loop
-                dependency.origin.dependents.add(dependency)
-                dependency.check()
-        else:
-            job._readyEvent.set()
-            job.state = JobState.READY
+        job.set_state(JobState.WAITING)
+        self.notify_job_submitted(job)
+        # Check if already done
         if job.donepath.exists():
-            job.state = JobState.DONE
+            job.set_state(JobState.DONE)
+            self.notify_job_state(job)  # Notify listeners of done state
         # Check if we have a running process
-        process = await job.aio_process()
-        if process is not None:
-            # Yep! First we notify the listeners
-            job.state = JobState.RUNNING
-            for listener in self.listeners:
-                try:
-                    listener.job_state(job)
-                except Exception:
-                    logger.exception("Got an error with listener %s", listener)
-            # Adds to the listeners
-            if self.xp.server is not None:
-                job.add_notification_server(self.xp.server)
-            # And now, we wait...
-            logger.info("Got a process for job %s - waiting to complete", job)
-            code = await process.aio_code()
-            logger.info("Job %s completed with code %s", job, code)
-            job.state = JobState.DONE if code == 0 else JobState.ERROR
-        # Check if done
-        if job.donepath.exists():
-            job.state = JobState.DONE
-        # OK, not done; let's start the job for real
-        while not job.state.finished():
-            # Wait that the job is ready
-            await job._readyEvent.wait()
-            job._readyEvent.clear()
-            if job.state == JobState.READY:
-                try:
-                    state = await self.aio_start(job)
-                except Exception:
-                    logger.exception("Got an exception while starting the job")
-                    raise
+        if not job.state.finished():
+            process = await job.aio_process()
+            if process is not None:
+                # Notify listeners that job is running
+                job.set_state(JobState.RUNNING)
+                self.notify_job_state(job)
+                # Adds to the listeners
+                if self.server is not None:
+                    job.add_notification_server(self.server)
+                # And now, we wait...
+                logger.info("Got a process for job %s - waiting to complete", job)
+                code = await process.aio_code()
+                logger.info("Job %s completed with code %s", job, code)
+                # Record exit code if available
+                if code is not None:
+                    job.exit_code = code
+                # Read state from .done/.failed files (contains detailed failure reason)
+                state = JobState.from_path(job.path, job.name)
+                # If state is a generic FAILED error, let the process determine
+                # the state (it may detect launcher-specific failures like SLURM timeout)
+                if (
+                    state is not None
+                    and isinstance(state, JobStateError)
+                    and state.failure_reason == JobFailureStatus.FAILED
+                    and code is not None
+                ):
+                    process_state = process.get_job_state(code)
+                    if (
+                        isinstance(process_state, JobStateError)
+                        and process_state.failure_reason != JobFailureStatus.FAILED
+                    ):
+                        # Process detected a more specific failure reason
+                        state = process_state
                 if state is None:
-                    # State is None if this is not the main thread
-                    return JobState.ERROR
-                job.state = state
-        for listener in self.listeners:
+                    if code is not None:
+                        # Fall back to process-specific state detection
+                        state = process.get_job_state(code)
+                    else:
+                        logger.error("No .done or .failed file found for job %s", job)
+                        state = JobState.ERROR
+                # Set endtime before set_state so database gets the timestamp
+                job.endtime = time.time()
+                job.set_state(state)
+                self.notify_job_state(job)  # Notify listeners of final state
+        # If not done or running, start the job
+        if not job.state.finished():
             try:
-                listener.job_state(job)
-            except Exception as e:
-                logger.exception("Listener %s did raise an exception", e)
-        # Job is finished
-        if job.state != JobState.DONE:
-            self.xp.failedJobs[job.identifier] = job
+                state = await self.aio_start(job)
+                # Set endtime before set_state so database gets the timestamp
+                job.endtime = time.time()
+                job.set_state(state)
+            except Exception:
+                logger.exception("Got an exception while starting the job")
+                raise
-        # Process all remaining tasks outputs
-        await asyncThreadcheck("End of job processing", job.done_handler)
+        # Job is finished - experiment statistics already updated by set_state
-        # Decrement the number of unfinished jobs and notify
-        self.xp.unfinishedJobs -= 1
-        async with self.xp.central.exitCondition:
-            logging.debug("Updated number of unfinished jobs")
-            self.xp.central.exitCondition.notify_all()
+        # Write final metadata with end time and final state
+        job.write_metadata()
-        job.endtime = time.time()
         if job in self.waitingjobs:
             self.waitingjobs.remove(job)
-        with job.dependents as dependents:
-            logger.info("Processing %d dependent jobs", len(dependents))
-            for dependency in dependents:
-                logger.debug("Checking dependency %s", dependency)
-                self.loop.call_soon(dependency.check)
+        # Process all remaining task outputs BEFORE notifying exit condition
+        # This ensures taskOutputQueueSize is updated before wait() can check it,
+        # preventing a race where wait() sees both unfinishedJobs==0 and
+        # taskOutputQueueSize==0 before callbacks have been queued.
+        await asyncThreadcheck("End of job processing", job.done_handler)
+        # Now notify - wait() will see the correct taskOutputQueueSize
+        async with self.exitCondition:
+            self.exitCondition.notify_all()
         return job.state
-    async def aio_start(self, job: Job) -> Optional[JobState]:
-        """Start a job
+    async def aio_start(self, job: Job) -> Optional[JobState]:  # noqa: C901
+        """Start a job with full job starting logic
+        This method handles job locking, dependency acquisition, directory setup,
+        and job execution while using the scheduler's coordination lock to prevent
+        race conditions between multiple jobs.
-        Returns None if the dependencies could not be locked after all
-        Returns DONE/ERROR depending on the process outcome
+        :param job: The job to start
+        :return: JobState.WAITING if dependencies could not be locked, JobState.DONE
+            if job completed successfully, JobState.ERROR if job failed during execution,
+            or None (should not occur in normal operation)
+        :raises Exception: Various exceptions during job execution, dependency locking,
+            or process creation
         """
+        from experimaestro.scheduler.jobs import JobStateError
+        from experimaestro.locking import Locks, LockError
+        from experimaestro.scheduler.jobs import JobFailureStatus
-        # We first lock the job before proceeding
+        # Assert preconditions
         assert job.launcher is not None
-        assert self.xp.central is not None
-        with Locks() as locks:
-            logger.debug("[starting] Locking job %s", job)
-            async with job.launcher.connector.lock(job.lockpath):
-                logger.debug("[starting] Locked job %s", job)
+        # Restart loop for resumable tasks that timeout
+        while True:
+            logger.debug(
+                "Starting job %s with %d dependencies",
+                job,
+                len(job.dependencies),
+            )
-                state = None
-                try:
-                    logger.debug(
-                        "Starting job %s with %d dependencies",
-                        job,
-                        len(job.dependencies),
-                    )
+            # Separate static and dynamic dependencies
+            static_deps = [d for d in job.dependencies if not d.is_dynamic()]
+            dynamic_deps = [d for d in job.dependencies if d.is_dynamic()]
-                    async with self.xp.central.dependencyLock:
-                        for dependency in job.dependencies:
-                            try:
-                                locks.append(dependency.lock().acquire())
-                            except LockError:
-                                logger.warning(
-                                    "Could not lock %s, aborting start for job %s",
-                                    dependency,
-                                    job,
+            # First, wait for all static dependencies (jobs) to complete
+            # These don't need the dependency lock as they can't change state
+            # Static dependency locks don't need to be added to locks list
+            logger.debug("Waiting for %d static dependencies", len(static_deps))
+            for dependency in static_deps:
+                logger.debug("Waiting for static dependency %s", dependency)
+                try:
+                    await dependency.aio_lock()
+                except RuntimeError as e:
+                    # Dependency failed - mark job as failed due to dependency
+                    logger.warning("Dependency failed: %s", e)
+                    return JobStateError(JobFailureStatus.DEPENDENCY)
+            # We first lock the job before proceeding
+            with Locks() as locks:
+                logger.debug("[starting] Locking job %s", job)
+                async with job.launcher.connector.lock(job.lockpath):
+                    logger.debug("[starting] Locked job %s", job)
+                    state = None
+                    try:
+                        # Now handle dynamic dependencies (tokens) with retry logic
+                        # CRITICAL: Only one task at a time can acquire dynamic dependencies
+                        # to prevent deadlocks (e.g., Task A holds Token1 waiting for Token2,
+                        # Task B holds Token2 waiting for Token1)
+                        if dynamic_deps:
+                            async with self.dependencyLock:
+                                logger.debug(
+                                    "Locking %d dynamic dependencies (tokens)",
+                                    len(dynamic_deps),
                                 )
-                                dependency.check()
-                                return JobState.WAITING
+                                while True:
+                                    all_locked = True
+                                    for idx, dependency in enumerate(dynamic_deps):
+                                        try:
+                                            # Use timeout=0 for first dependency, 0.1s for subsequent
+                                            timeout = 0 if idx == 0 else 0.1
+                                            # Acquire the lock (this might block on IPC locks)
+                                            lock = await dependency.aio_lock(
+                                                timeout=timeout
+                                            )
+                                            locks.append(lock)
+                                        except LockError:
+                                            logger.info(
+                                                "Could not lock %s, retrying",
+                                                dependency,
+                                            )
+                                            # Release all locks and restart
+                                            for lock in locks.locks:
+                                                lock.release()
+                                            locks.locks.clear()
+                                            # Put failed dependency first
+                                            dynamic_deps.remove(dependency)
+                                            dynamic_deps.insert(0, dependency)
+                                            all_locked = False
+                                            break
+                                    if all_locked:
+                                        # All locks acquired successfully
+                                        break
+                        # Dependencies have been locked, we can start the job
+                        job.starttime = time.time()
+                        # Creates the main directory
+                        directory = job.path
+                        logger.debug("Making directories job %s...", directory)
+                        # Warn about directory cleanup for non-resumable tasks
+                        # (only once per task type)
+                        xpmtype = job.config.__xpmtype__
+                        if (
+                            directory.is_dir()
+                            and not job.resumable
+                            and not xpmtype.warned_clean_not_resumable
+                        ):
+                            xpmtype.warned_clean_not_resumable = True
+                            logger.warning(
+                                "In a future version, directory will be cleaned up for "
+                                "non-resumable tasks (%s). Use ResumableTask if you want "
+                                "to preserve the directory contents.",
+                                xpmtype.identifier,
+                            )
-                    for listener in self.listeners:
-                        listener.job_state(job)
+                        if not directory.is_dir():
+                            directory.mkdir(parents=True, exist_ok=True)
-                    job.starttime = time.time()
+                        # Write metadata with submit and start time (after directory creation)
+                        job.write_metadata()
-                    # Creates the main directory
-                    directory = job.path
-                    logger.debug("Making directories job %s...", directory)
-                    if not directory.is_dir():
-                        directory.mkdir(parents=True, exist_ok=True)
+                        # Sets up the notification URL
+                        if self.server is not None:
+                            job.add_notification_server(self.server)
-                    # Sets up the notification URL
-                    if self.xp.server is not None:
-                        job.add_notification_server(self.xp.server)
+                    except Exception:
+                        logger.warning("Error while locking job", exc_info=True)
+                        return JobState.WAITING
-                except Exception:
-                    logger.warning("Error while locking job", exc_info=True)
-                    return JobState.WAITING
+                    try:
+                        # Runs the job
+                        process = await job.aio_run()
+                    except Exception:
+                        logger.warning("Error while starting job", exc_info=True)
+                        return JobState.ERROR
+                # Wait for job to complete while holding locks
                 try:
-                    # Runs the job
-                    process = await job.aio_run()
-                except Exception:
-                    logger.warning("Error while starting job", exc_info=True)
-                    return JobState.ERROR
-            try:
-                if isinstance(process, JobState):
-                    state = process
-                    logger.debug("Job %s ended (state %s)", job, state)
-                else:
                     logger.debug("Waiting for job %s process to end", job)
                     code = await process.aio_code()
                     logger.debug("Got return code %s for %s", code, job)
-                    # Check the file if there is no return code
-                    if code is None:
-                        # Case where we cannot retrieve the code right away
-                        if job.donepath.is_file():
-                            code = 0
+                    # Record exit code if available
+                    if code is not None:
+                        logger.info("Job %s ended with code %s", job, code)
+                        job.exit_code = code
+                    else:
+                        logger.info("Job %s ended, reading state from files", job)
+                    # Read state from .done/.failed files (contains detailed failure reason)
+                    state = JobState.from_path(job.path, job.name)
+                    # If state is a generic FAILED error, let the process determine
+                    # the state (it may detect launcher-specific failures like SLURM timeout)
+                    if (
+                        state is not None
+                        and isinstance(state, JobStateError)
+                        and state.failure_reason == JobFailureStatus.FAILED
+                        and code is not None
+                    ):
+                        process_state = process.get_job_state(code)
+                        if (
+                            isinstance(process_state, JobStateError)
+                            and process_state.failure_reason != JobFailureStatus.FAILED
+                        ):
+                            # Process detected a more specific failure reason
+                            state = process_state
+                    if state is None:
+                        if code is not None:
+                            # Fall back to process-specific state detection
+                            state = process.get_job_state(code)
                         else:
-                            code = int(job.failedpath.read_text())
-                    logger.debug("Job %s ended with code %s", job, code)
-                    state = JobState.DONE if code == 0 else JobState.ERROR
-            except JobError:
-                logger.warning("Error while running job")
-                state = JobState.ERROR
-            except Exception:
-                logger.warning(
-                    "Error while running job (in experimaestro)", exc_info=True
-                )
-                state = JobState.ERROR
-        return state
-ServiceClass = TypeVar("ServiceClass", bound=Service)
-class experiment:
-    """Main experiment object
-    It is a context object, i.e. experiments is run with
-    ```py
-        with experiment(...) as xp:
-            ...
-    ```
-    """
-    #: Current experiment
-    CURRENT: Optional["experiment"] = None
-    @staticmethod
-    def current() -> "experiment":
-        """Returns the current experiment, but checking first if set
-        If there is no current experiment, raises an AssertError
-        """
-        assert experiment.CURRENT is not None, "No current experiment defined"
-        return experiment.CURRENT
-    def __init__(
-        self,
-        env: Union[Path, str, WorkspaceSettings],
-        name: str,
-        *,
-        host: Optional[str] = None,
-        port: Optional[int] = None,
-        token: Optional[str] = None,
-        run_mode: Optional[RunMode] = None,
-        launcher=None,
-    ):
-        """
-        :param env: an environment -- or a working directory for a local
-            environment
-        :param name: the identifier of the experiment
-        :param launcher: The launcher (if not provided, inferred from path)
-        :param host: The host for the web server (overrides the environment if
-            set)
-        :param port: the port for the web server (overrides the environment if
-            set). Use negative number to avoid running a web server (default when dry run).
-        :param run_mode: The run mode for the experiment (normal, generate run
-            files, dry run)
-        """
-        from experimaestro.server import Server
-        from experimaestro.scheduler import Listener
-        settings = get_settings()
-        if not isinstance(env, WorkspaceSettings):
-            env = WorkspaceSettings(id=None, path=Path(env))
-        # Creates the workspace
-        run_mode = run_mode or RunMode.NORMAL
-        self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)
-        # Mark the directory has an experimaestro folder
-        self.workdir = self.workspace.experimentspath / name
-        self.workdir.mkdir(parents=True, exist_ok=True)
-        self.xplockpath = self.workdir / "lock"
-        self.xplock = None
-        self.old_experiment = None
-        self.services: Dict[str, Service] = {}
-        self._job_listener: Optional[Listener] = None
-        # Get configuration settings
-        if host is not None:
-            settings.server.host = host
-        if port is not None:
-            settings.server.port = port
-        if token is not None:
-            settings.server.token = token
-        # Create the scheduler
-        self.scheduler = Scheduler(self, name)
-        self.server = (
-            Server(self.scheduler, settings.server)
-            if (settings.server.port is not None and settings.server.port >= 0)
-            and self.workspace.run_mode == RunMode.NORMAL
-            else None
-        )
-        if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
-            import faulthandler
-            logger.info("Enabling fault handler")
-            faulthandler.enable(all_threads=True)
-    def submit(self, job: Job):
-        return self.scheduler.submit(job)
-    def prepare(self, job: Job):
-        """Generate the file"""
-        return self.scheduler.prepare(job)
-    @property
-    def run_mode(self):
-        return self.workspace.run_mode
-    @property
-    def loop(self):
-        assert self.central is not None
-        return self.central.loop
-    @property
-    def resultspath(self):
-        """Return the directory in which results can be stored for this experiment"""
-        return self.workdir / "results"
-    @property
-    def jobspath(self):
-        """Return the directory in which results can be stored for this experiment"""
-        return self.workdir / "jobs"
-    @property
-    def alt_jobspaths(self):
-        """Return potential other directories"""
-        for alt_workdir in self.workspace.alt_workdirs:
-            yield alt_workdir / "jobs"
-    @property
-    def jobsbakpath(self):
-        """Return the directory in which results can be stored for this experiment"""
-        return self.workdir / "jobs.bak"
-    def stop(self):
-        """Stop the experiment as soon as possible"""
-        async def doStop():
-            assert self.central is not None
-            async with self.central.exitCondition:
-                self.exitMode = True
-                logging.debug("Setting exit mode to true")
-                self.central.exitCondition.notify_all()
-        assert self.central is not None and self.central.loop is not None
-        asyncio.run_coroutine_threadsafe(doStop(), self.central.loop)
-    def wait(self):
-        """Wait until the running processes have finished"""
-        async def awaitcompletion():
-            assert self.central is not None
-            logger.debug("Waiting to exit scheduler...")
-            async with self.central.exitCondition:
-                while True:
-                    if self.exitMode:
-                        break
-                    # If we have still unfinished jobs or possible new tasks, wait
-                    logger.debug(
-                        "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
-                        self.unfinishedJobs,
-                        self.taskOutputQueueSize,
-                    )
-                    if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
-                        break
-                    # Wait for more news...
-                    await self.central.exitCondition.wait()
-                if self.failedJobs:
-                    # Show some more information
-                    count = 0
-                    for job in self.failedJobs.values():
-                        if job.failure_status != JobFailureStatus.DEPENDENCY:
-                            count += 1
                             logger.error(
-                                "Job %s failed, check the log file %s",
-                                job.relpath,
-                                job.stderr,
+                                "No .done or .failed file found for job %s", job
                             )
-                    raise FailedExperiment(f"{count} failed jobs")
-        future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
-        return future.result()
-    def setenv(self, name, value, override=True):
-        """Shortcut to set the environment value"""
-        if override or name not in self.workspace.env:
-            logging.info("Setting environment: %s=%s", name, value)
-            self.workspace.env[name] = value
-    def token(self, name: str, count: int):
-        """Returns a token for this experiment
-        The token is the default token of the workspace connector"""
-        return self.workspace.connector.createtoken(name, count)
-    def __enter__(self):
-        from .dynamic_outputs import TaskOutputsWorker
-        if self.workspace.run_mode != RunMode.DRY_RUN:
-            logger.info("Locking experiment %s", self.xplockpath)
-            self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
-            logger.info("Experiment locked")
-        # Move old jobs into "jobs.bak"
-        if self.workspace.run_mode == RunMode.NORMAL:
-            self.jobsbakpath.mkdir(exist_ok=True)
-            for p in self.jobspath.glob("*/*"):
-                if p.is_symlink():
-                    target = self.jobsbakpath / p.relative_to(self.jobspath)
-                    if target.is_symlink():
-                        # Remove if duplicate
-                        p.unlink()
-                    else:
-                        # Rename otherwise
-                        target.parent.mkdir(parents=True, exist_ok=True)
-                        p.rename(target)
-        if self.server:
-            self.server.start()
-        self.workspace.__enter__()
-        (self.workspace.path / ".__experimaestro__").touch()
-        global SIGNAL_HANDLER
-        # Number of unfinished jobs
-        self.unfinishedJobs = 0
-        self.taskOutputQueueSize = 0
-        # List of failed jobs
-        self.failedJobs: Dict[str, Job] = {}
-        # Exit mode when catching signals
-        self.exitMode = False
-        self.central = SchedulerCentral.create(self.scheduler.name)
-        self.taskOutputsWorker = TaskOutputsWorker(self)
-        self.taskOutputsWorker.start()
-        SIGNAL_HANDLER.add(self)
-        self.old_experiment = experiment.CURRENT
-        experiment.CURRENT = self
-        return self
-    def __exit__(self, exc_type, exc_value, traceback):
-        logger.debug("Exiting scheduler context")
-        # If no exception and normal run mode, remove old "jobs"
-        if self.workspace.run_mode == RunMode.NORMAL:
-            if exc_type is None and self.jobsbakpath.is_dir():
-                rmtree(self.jobsbakpath)
-        # Close the different locks
-        try:
-            if exc_type:
-                # import faulthandler
-                # faulthandler.dump_traceback()
-                logger.error(
-                    "Not waiting since an exception was thrown"
-                    " (some jobs may be running)"
-                )
-            else:
-                self.wait()
-        finally:
-            SIGNAL_HANDLER.remove(self)
-            # Stop services
-            for service in self.services.values():
-                logger.info("Closing service %s", service.description())
-                service.stop()
-            if self.central is not None:
-                logger.info("Stopping scheduler event loop")
-                self.central.loop.stop()
-            if self.taskOutputsWorker is not None:
-                logger.info("Stopping tasks outputs worker")
-                self.taskOutputsWorker.queue.put(None)
-            self.central = None
-            self.workspace.__exit__(exc_type, exc_value, traceback)
-            if self.xplock:
-                self.xplock.__exit__(exc_type, exc_value, traceback)
-            # Put back old experiment as current one
-            experiment.CURRENT = self.old_experiment
-            if self.server:
-                logger.info("Stopping web server")
-                self.server.stop()
-        if self.workspace.run_mode == RunMode.NORMAL:
-            # Write the state
-            logging.info("Saving the experiment state")
-            from experimaestro.scheduler.state import ExperimentState
-            ExperimentState.save(
-                self.workdir / "state.json", self.scheduler.jobs.values()
-            )
-    async def update_task_output_count(self, delta: int):
-        """Change in the number of task outputs to process"""
-        async with self.central.exitCondition:
-            self.taskOutputQueueSize += delta
-            logging.debug(
-                "Updating queue size with %d => %d", delta, self.taskOutputQueueSize
-            )
-            if self.taskOutputQueueSize == 0:
-                self.central.exitCondition.notify_all()
-    def watch_output(self, watched: "WatchedOutput"):
-        """Watch an output
-        :param watched: The watched output specification
-        """
-        self.taskOutputsWorker.watch_output(watched)
-    def add_service(self, service: ServiceClass) -> ServiceClass:
-        """Adds a service (e.g. tensorboard viewer) to the experiment
-        :param service: A service instance
-        :return: The same service instance
-        """
-        self.services[service.id] = service
-        for listener in self.scheduler.listeners:
-            listener.service_add(service)
-        return service
+                            state = JobState.ERROR
-    def save(self, obj: Any, name: str = "default"):
-        """Serializes configurations.
+                except JobError:
+                    logger.warning("Error while running job")
+                    state = JobState.ERROR
-        Saves configuration objects within the experimental directory
-        :param obj: The object to save
-        :param name: The name of the saving directory (default to `default`)
-        """
-        if self.workspace.run_mode == RunMode.NORMAL:
-            from experimaestro import save
-            save_dir = self.workdir / "data" / name
-            save_dir.mkdir(exist_ok=True, parents=True)
-            save(obj, save_dir)
-    def load(self, reference: str, name: str = "default"):
-        """Serializes configurations.
-        Loads configuration objects from an experimental directory
-        :param reference: The name of the experiment
-        :param name: The name of the saving directory (default to `default`)
-        """
-        from experimaestro import load
+                except Exception:
+                    logger.warning(
+                        "Error while running job (in experimaestro)", exc_info=True
+                    )
+                    state = JobState.ERROR
+            # Locks are released here after job completes
+            # Check if we should restart a resumable task that timed out
+            from experimaestro.scheduler.jobs import JobStateError
+            if (
+                isinstance(state, JobStateError)
+                and state.failure_reason == JobFailureStatus.TIMEOUT
+                and job.resumable
+            ):
+                job.retry_count += 1
+                if job.retry_count <= job.max_retries:
+                    logger.info(
+                        "Resumable task %s timed out - restarting (attempt %d/%d)",
+                        job,
+                        job.retry_count,
+                        job.max_retries,
+                    )
+                    # Rotate log files to preserve previous run's logs
+                    job.rotate_logs()
+                    # Clear cached process so aio_run() will create a new one
+                    job._process = None
+                    # Delete PID file so the job will be resubmitted
+                    if job.pidpath.exists():
+                        job.pidpath.unlink()
+                    # Continue the loop to restart
+                    continue
+                else:
+                    logger.warning(
+                        "Resumable task %s exceeded max retries (%d), marking as failed",
+                        job,
+                        job.max_retries,
+                    )
+                    # Fall through to return the error state
-        path = self.workspace.experimentspath / reference / "data" / name
-        return load(path)
+            # Job finished (success or non-recoverable error)
+            # Notify scheduler listeners of job state after job completes
+            self.notify_job_state(job)
+            return state

experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

Potentially problematic release.

experimaestro 1.11.1py3-none-any.whl → 2.0.0b4py3-none-any.whl