PyPI - experimaestro - Versions diffs - 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl - Mend

experimaestro 2.0.0a8-py3-none-any.whl → 2.0.0b8-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of experimaestro might be problematic. Click here for more details.

Files changed (122)

experimaestro/__init__.py +10 -11
experimaestro/annotations.py +167 -206
experimaestro/cli/__init__.py +278 -7
experimaestro/cli/filter.py +42 -74
experimaestro/cli/jobs.py +157 -106
experimaestro/cli/refactor.py +249 -0
experimaestro/click.py +0 -1
experimaestro/commandline.py +19 -3
experimaestro/connectors/__init__.py +20 -1
experimaestro/connectors/local.py +12 -0
experimaestro/core/arguments.py +182 -46
experimaestro/core/identifier.py +107 -6
experimaestro/core/objects/__init__.py +6 -0
experimaestro/core/objects/config.py +542 -25
experimaestro/core/objects/config_walk.py +20 -0
experimaestro/core/serialization.py +91 -34
experimaestro/core/subparameters.py +164 -0
experimaestro/core/types.py +175 -38
experimaestro/exceptions.py +26 -0
experimaestro/experiments/cli.py +111 -25
experimaestro/generators.py +50 -9
experimaestro/huggingface.py +3 -1
experimaestro/launcherfinder/parser.py +29 -0
experimaestro/launchers/__init__.py +26 -1
experimaestro/launchers/direct.py +12 -0
experimaestro/launchers/slurm/base.py +154 -2
experimaestro/mkdocs/metaloader.py +0 -1
experimaestro/mypy.py +452 -7
experimaestro/notifications.py +63 -13
experimaestro/progress.py +0 -2
experimaestro/rpyc.py +0 -1
experimaestro/run.py +19 -6
experimaestro/scheduler/base.py +510 -125
experimaestro/scheduler/dependencies.py +43 -28
experimaestro/scheduler/dynamic_outputs.py +259 -130
experimaestro/scheduler/experiment.py +256 -31
experimaestro/scheduler/interfaces.py +501 -0
experimaestro/scheduler/jobs.py +216 -206
experimaestro/scheduler/remote/__init__.py +31 -0
experimaestro/scheduler/remote/client.py +874 -0
experimaestro/scheduler/remote/protocol.py +467 -0
experimaestro/scheduler/remote/server.py +423 -0
experimaestro/scheduler/remote/sync.py +144 -0
experimaestro/scheduler/services.py +323 -23
experimaestro/scheduler/state_db.py +437 -0
experimaestro/scheduler/state_provider.py +2766 -0
experimaestro/scheduler/state_sync.py +891 -0
experimaestro/scheduler/workspace.py +52 -10
experimaestro/scriptbuilder.py +7 -0
experimaestro/server/__init__.py +147 -57
experimaestro/server/data/index.css +0 -125
experimaestro/server/data/index.css.map +1 -1
experimaestro/server/data/index.js +194 -58
experimaestro/server/data/index.js.map +1 -1
experimaestro/settings.py +44 -5
experimaestro/sphinx/__init__.py +3 -3
experimaestro/taskglobals.py +20 -0
experimaestro/tests/conftest.py +80 -0
experimaestro/tests/core/test_generics.py +2 -2
experimaestro/tests/identifier_stability.json +45 -0
experimaestro/tests/launchers/bin/sacct +6 -2
experimaestro/tests/launchers/bin/sbatch +4 -2
experimaestro/tests/launchers/test_slurm.py +80 -0
experimaestro/tests/tasks/test_dynamic.py +231 -0
experimaestro/tests/test_cli_jobs.py +615 -0
experimaestro/tests/test_deprecated.py +630 -0
experimaestro/tests/test_environment.py +200 -0
experimaestro/tests/test_file_progress_integration.py +1 -1
experimaestro/tests/test_forward.py +3 -3
experimaestro/tests/test_identifier.py +372 -41
experimaestro/tests/test_identifier_stability.py +458 -0
experimaestro/tests/test_instance.py +3 -3
experimaestro/tests/test_multitoken.py +442 -0
experimaestro/tests/test_mypy.py +433 -0
experimaestro/tests/test_objects.py +312 -5
experimaestro/tests/test_outputs.py +2 -2
experimaestro/tests/test_param.py +8 -12
experimaestro/tests/test_partial_paths.py +231 -0
experimaestro/tests/test_progress.py +0 -48
experimaestro/tests/test_remote_state.py +671 -0
experimaestro/tests/test_resumable_task.py +480 -0
experimaestro/tests/test_serializers.py +141 -1
experimaestro/tests/test_state_db.py +434 -0
experimaestro/tests/test_subparameters.py +160 -0
experimaestro/tests/test_tags.py +136 -0
experimaestro/tests/test_tasks.py +107 -121
experimaestro/tests/test_token_locking.py +252 -0
experimaestro/tests/test_tokens.py +17 -13
experimaestro/tests/test_types.py +123 -1
experimaestro/tests/test_workspace_triggers.py +158 -0
experimaestro/tests/token_reschedule.py +4 -2
experimaestro/tests/utils.py +2 -2
experimaestro/tokens.py +154 -57
experimaestro/tools/diff.py +1 -1
experimaestro/tui/__init__.py +8 -0
experimaestro/tui/app.py +2395 -0
experimaestro/tui/app.tcss +353 -0
experimaestro/tui/log_viewer.py +228 -0
experimaestro/utils/__init__.py +23 -0
experimaestro/utils/environment.py +148 -0
experimaestro/utils/git.py +129 -0
experimaestro/utils/resources.py +1 -1
experimaestro/version.py +34 -0
{experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
experimaestro-2.0.0b8.dist-info/RECORD +187 -0
{experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
experimaestro/compat.py +0 -6
experimaestro/core/objects.pyi +0 -221
experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
experimaestro-2.0.0a8.dist-info/RECORD +0 -166
experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
{experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0

experimaestro/scheduler/base.py (changed)

@@ -1,15 +1,16 @@
-import logging
 import threading
 import time
 from typing import (
     Optional,
     Set,
+    ClassVar,
+    TYPE_CHECKING,
 )
 import asyncio
 from typing import Dict
 from experimaestro.scheduler import experiment
-from experimaestro.scheduler.jobs import Job, JobState
+from experimaestro.scheduler.jobs import Job, JobState, JobError
 from experimaestro.scheduler.services import Service
@@ -17,6 +18,11 @@ from experimaestro.utils import logger
 from experimaestro.utils.asyncio import asyncThreadcheck
 import concurrent.futures
+if TYPE_CHECKING:
+    from experimaestro.server import Server
+    from experimaestro.settings import ServerSettings
+    from experimaestro.scheduler.workspace import Workspace
 class Listener:
     def job_submitted(self, job):
@@ -31,18 +37,24 @@ class Listener:
 class Scheduler(threading.Thread):
-    """A job scheduler
+    """A job scheduler (singleton)
-    The scheduler is based on asyncio for easy concurrency handling
+    The scheduler is based on asyncio for easy concurrency handling.
+    This is a singleton - only one scheduler instance exists per process.
     """
-    def __init__(self, xp: "experiment", name: str):
+    _instance: ClassVar[Optional["Scheduler"]] = None
+    _lock: ClassVar[threading.Lock] = threading.Lock()
+    def __init__(self, name: str = "Global"):
         super().__init__(name=f"Scheduler ({name})", daemon=True)
         self._ready = threading.Event()
-        # Name of the experiment
+        # Name of the scheduler
         self.name = name
-        self.xp = xp
+        # Track experiments (simple dict for now)
+        self.experiments: Dict[str, "experiment"] = {}
         # Exit mode activated
         self.exitmode = False
@@ -53,16 +65,101 @@ class Scheduler(threading.Thread):
         # List of jobs
         self.waitingjobs: Set[Job] = set()
-        # Listeners
-        self.listeners: Set[Listener] = set()
+        # Listeners with thread-safe access
+        self._listeners: Set[Listener] = set()
+        self._listeners_lock = threading.Lock()
+        # Notification thread pool (single worker to serialize notifications)
+        self._notification_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=1, thread_name_prefix="NotificationWorker"
+        )
+        # Server (managed by scheduler)
+        self.server: Optional["Server"] = None
     @staticmethod
-    def create(xp: "experiment", name: str):
-        instance = Scheduler(xp, name)
+    def has_instance() -> bool:
+        """Check if a scheduler instance exists without creating one"""
+        return Scheduler._instance is not None
+    @staticmethod
+    def instance() -> "Scheduler":
+        """Get or create the global scheduler instance"""
+        if Scheduler._instance is None:
+            with Scheduler._lock:
+                if Scheduler._instance is None:
+                    Scheduler._instance = Scheduler._create()
+        return Scheduler._instance
+    @staticmethod
+    def _create(name: str = "Global"):
+        """Internal method to create and start scheduler"""
+        instance = Scheduler(name)
         instance.start()
         instance._ready.wait()
         return instance
+    @staticmethod
+    def create(xp: "experiment" = None, name: str = "Global"):
+        """Create or get the scheduler instance
+        Args:
+            xp: (Deprecated) Experiment reference, ignored
+            name: Name for the scheduler (only used on first creation)
+        Returns:
+            The global scheduler instance
+        """
+        return Scheduler.instance()
+    def register_experiment(self, xp: "experiment"):
+        """Register an experiment with the scheduler"""
+        # Use experiment name as key for now
+        key = xp.workdir.name
+        self.experiments[key] = xp
+        logger.debug("Registered experiment %s with scheduler", key)
+    def unregister_experiment(self, xp: "experiment"):
+        """Unregister an experiment from the scheduler"""
+        key = xp.workdir.name
+        if key in self.experiments:
+            del self.experiments[key]
+            logger.debug("Unregistered experiment %s from scheduler", key)
+    def start_server(
+        self, settings: "ServerSettings" = None, workspace: "Workspace" = None
+    ):
+        """Start the notification server (if not already running)
+        Args:
+            settings: Server settings
+            workspace: Workspace instance (required to get workspace path)
+        """
+        if self.server is None:
+            from experimaestro.server import Server
+            from experimaestro.scheduler.state_provider import WorkspaceStateProvider
+            if workspace is None:
+                raise ValueError("workspace parameter is required to start server")
+            # Get the workspace state provider singleton
+            state_provider = WorkspaceStateProvider.get_instance(
+                workspace.path, read_only=False, sync_on_start=False
+            )
+            self.server = Server.instance(settings, state_provider)
+            self.server.start()
+            logger.info("Server started by scheduler")
+        else:
+            logger.debug("Server already running")
+    def stop_server(self):
+        """Stop the notification server"""
+        if self.server is not None:
+            self.server.stop()
+            logger.info("Server stopped by scheduler")
     def run(self):
         """Run the event loop forever"""
         logger.debug("Starting event loop thread")
@@ -72,6 +169,10 @@ class Scheduler(threading.Thread):
         # Set loop-dependent variables
         self.exitCondition = asyncio.Condition()
         self.dependencyLock = asyncio.Lock()
+        # Note: State provider removed - now managed at workspace level
+        # Each experiment has its own workspace with database
         self._ready.set()
         self.loop.run_forever()
@@ -84,10 +185,38 @@ class Scheduler(threading.Thread):
             logger.warning("Scheduler already started")
     def addlistener(self, listener: Listener):
-        self.listeners.add(listener)
+        with self._listeners_lock:
+            self._listeners.add(listener)
     def removelistener(self, listener: Listener):
-        self.listeners.remove(listener)
+        with self._listeners_lock:
+            self._listeners.discard(listener)
+    def clear_listeners(self):
+        """Clear all listeners (for testing purposes)"""
+        with self._listeners_lock:
+            self._listeners.clear()
+    def wait_for_notifications(self, timeout: float = 5.0) -> bool:
+        """Wait for all pending notifications to be processed.
+        This submits a sentinel task and waits for it to complete,
+        ensuring all previously submitted notifications have been processed.
+        Args:
+            timeout: Maximum time to wait in seconds
+        Returns:
+            True if all notifications were processed, False if timeout occurred
+        """
+        try:
+            # Submit a no-op and wait for it to complete
+            future = self._notification_executor.submit(lambda: None)
+            future.result(timeout=timeout)
+            return True
+        except concurrent.futures.TimeoutError:
+            logger.warning("Timeout waiting for notification queue to drain")
+            return False
     def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
         # Check if the job belongs to this scheduler
@@ -104,17 +233,25 @@ class Scheduler(threading.Thread):
     def submit(self, job: Job) -> Optional[Job]:
         # Wait for the future containing the submitted job
-        logger.debug("Registering the job %s within the scheduler", job)
+        logger.debug("Submit job %s to the scheduler", job)
         otherFuture = asyncio.run_coroutine_threadsafe(
             self.aio_registerJob(job), self.loop
         )
         other = otherFuture.result()
         logger.debug("Job already submitted" if other else "First submission")
-        if other:
-            return other
+        # Only returns if job was already submitted and doesn't need reprocessing
+        if other is not None:
+            # If state is WAITING, it was just reset for resubmission and needs processing
+            # If state is RUNNING or finished (DONE), no need to reprocess
+            if other.state != JobState.WAITING:
+                return other
+            # Use 'other' for resubmission since it has the correct experiments list
+            job = other
         job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
-        return None
+        return other
     def prepare(self, job: Job):
         """Prepares the job for running"""
@@ -129,49 +266,99 @@ class Scheduler(threading.Thread):
         if self.exitmode:
             logger.warning("Exit mode: not submitting")
+            return
-        elif job.identifier in self.jobs:
+        # Job was already submitted
+        if job.identifier in self.jobs:
             other = self.jobs[job.identifier]
             assert job.type == other.type
-            if other.state == JobState.ERROR:
+            # Add current experiment to the existing job's experiments list
+            xp = experiment.current()
+            xp.add_job(other)
+            # Copy watched outputs from new job to existing job
+            # This ensures new callbacks are registered even for resubmitted jobs
+            other.watched_outputs.extend(job.watched_outputs)
+            if other.state.is_error():
                 logger.info("Re-submitting job")
+                # Clean up old process info so it will be re-started
+                other._process = None
+                if other.pidpath.is_file():
+                    other.pidpath.unlink()
+                # Use set_state to handle experiment statistics updates
+                other.set_state(JobState.WAITING)
+                self.notify_job_state(other)  # Notify listeners of re-submit
             else:
                 logger.warning("Job %s already submitted", job.identifier)
-                return other
-        else:
-            # Register this job
-            self.xp.unfinishedJobs += 1
-            self.jobs[job.identifier] = job
+            # Returns the previous job
+            return other
+        # Register this job
+        xp = experiment.current()
+        self.jobs[job.identifier] = job
+        # Set submittime now so that add_job can record it in the database
+        # (aio_submit may update this later for re-submitted jobs)
+        job.submittime = time.time()
+        xp.add_job(job)
+        # Set up dependencies
+        for dependency in job.dependencies:
+            dependency.target = job
+            dependency.origin.dependents.add(dependency)
         return None
+    def _notify_listeners(self, notification_func, job: Job):
+        """Execute notification in thread pool with error isolation.
+        This runs notifications in a dedicated thread pool to avoid blocking
+        the scheduler and to isolate errors from affecting other listeners.
+        """
+        def _do_notify():
+            # Get a snapshot of listeners with the lock
+            with self._listeners_lock:
+                listeners_snapshot = list(self._listeners)
+            for listener in listeners_snapshot:
+                try:
+                    notification_func(listener, job)
+                except Exception:
+                    logger.exception("Got an error with listener %s", listener)
+        self._notification_executor.submit(_do_notify)
     def notify_job_submitted(self, job: Job):
         """Notify the listeners that a job has been submitted"""
-        for listener in self.listeners:
-            try:
-                listener.job_submitted(job)
-            except Exception:
-                logger.exception("Got an error with listener %s", listener)
+        self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
     def notify_job_state(self, job: Job):
         """Notify the listeners that a job has changed state"""
-        for listener in self.listeners:
-            try:
-                listener.job_state(job)
-            except Exception:
-                logger.exception("Got an error with listener %s", listener)
+        self._notify_listeners(lambda lst, j: lst.job_state(j), job)
+    def notify_service_add(self, service: Service):
+        """Notify the listeners that a service has been added"""
+        self._notify_listeners(lambda lst, s: lst.service_add(s), service)
-    async def aio_submit(self, job: Job) -> JobState:  # noqa: C901
+    async def aio_submit(self, job: Job) -> JobState:
         """Main scheduler function: submit a job, run it (if needed), and returns
         the status code
         """
+        from experimaestro.scheduler.jobs import JobStateError, JobFailureStatus
         logger.info("Submitting job %s", job)
-        job._readyEvent = asyncio.Event()
         job.submittime = time.time()
         job.scheduler = self
         self.waitingjobs.add(job)
+        # Register watched outputs now that the job has a scheduler
+        job.register_watched_outputs()
+        # Note: Job metadata will be written after directory is created in aio_start
         # Check that we don't have a completed job in
         # alternate directories
         for jobspath in experiment.current().alt_jobspaths:
@@ -185,126 +372,324 @@ class Scheduler(threading.Thread):
             path.unlink()
         path.symlink_to(job.path)
-        job.state = JobState.WAITING
+        job.set_state(JobState.WAITING)
         self.notify_job_submitted(job)
-        # Add dependencies, and add to blocking resources
-        if job.dependencies:
-            job.unsatisfied = len(job.dependencies)
-            for dependency in job.dependencies:
-                dependency.target = job
-                dependency.loop = self.loop
-                dependency.origin.dependents.add(dependency)
-                dependency.check()
-        else:
-            job._readyEvent.set()
-            job.state = JobState.READY
+        # Check if already done
         if job.donepath.exists():
-            job.state = JobState.DONE
+            job.set_state(JobState.DONE)
+            self.notify_job_state(job)  # Notify listeners of done state
         # Check if we have a running process
-        process = await job.aio_process()
-        if process is not None:
-            # Yep! First we notify the listeners
-            job.state = JobState.RUNNING
-            # Notify the listeners
-            self.notify_job_state(job)
-            # Adds to the listeners
-            if self.xp.server is not None:
-                job.add_notification_server(self.xp.server)
-            # And now, we wait...
-            logger.info("Got a process for job %s - waiting to complete", job)
-            code = await process.aio_code()
-            logger.info("Job %s completed with code %s", job, code)
-            job.state = JobState.DONE if code == 0 else JobState.ERROR
-        # Check if done
-        if job.donepath.exists():
-            job.state = JobState.DONE
-        # OK, not done; let's start the job for real
-        while not job.state.finished():
-            # Wait that the job is ready
-            await job._readyEvent.wait()
-            job._readyEvent.clear()
-            if job.state == JobState.READY:
-                try:
-                    state = await self.aio_start(job)
-                except Exception:
-                    logger.exception("Got an exception while starting the job")
-                    raise
+        if not job.state.finished():
+            process = await job.aio_process()
+            if process is not None:
+                # Notify listeners that job is running
+                job.set_state(JobState.RUNNING)
+                self.notify_job_state(job)
+                # Adds to the listeners
+                if self.server is not None:
+                    job.add_notification_server(self.server)
+                # And now, we wait...
+                logger.info("Got a process for job %s - waiting to complete", job)
+                code = await process.aio_code()
+                logger.info("Job %s completed with code %s", job, code)
+                # Record exit code if available
+                if code is not None:
+                    job.exit_code = code
+                # Read state from .done/.failed files (contains detailed failure reason)
+                state = JobState.from_path(job.path, job.name)
+                # If state is a generic FAILED error, let the process determine
+                # the state (it may detect launcher-specific failures like SLURM timeout)
+                if (
+                    state is not None
+                    and isinstance(state, JobStateError)
+                    and state.failure_reason == JobFailureStatus.FAILED
+                    and code is not None
+                ):
+                    process_state = process.get_job_state(code)
+                    if (
+                        isinstance(process_state, JobStateError)
+                        and process_state.failure_reason != JobFailureStatus.FAILED
+                    ):
+                        # Process detected a more specific failure reason
+                        state = process_state
                 if state is None:
-                    # State is None if this is not the main thread
-                    return JobState.ERROR
+                    if code is not None:
+                        # Fall back to process-specific state detection
+                        state = process.get_job_state(code)
+                    else:
+                        logger.error("No .done or .failed file found for job %s", job)
+                        state = JobState.ERROR
+                # Set endtime before set_state so database gets the timestamp
+                job.endtime = time.time()
+                job.set_state(state)
+                self.notify_job_state(job)  # Notify listeners of final state
+        # If not done or running, start the job
+        if not job.state.finished():
+            try:
+                state = await self.aio_start(job)
+                # Set endtime before set_state so database gets the timestamp
+                job.endtime = time.time()
+                job.set_state(state)
+            except Exception:
+                logger.exception("Got an exception while starting the job")
+                raise
-                job.state = state
+        # Job is finished - experiment statistics already updated by set_state
-        self.notify_job_state(job)
+        # Write final metadata with end time and final state
+        job.write_metadata()
-        # Job is finished
-        if job.state != JobState.DONE:
-            self.xp.failedJobs[job.identifier] = job
+        if job in self.waitingjobs:
+            self.waitingjobs.remove(job)
-        # Process all remaining tasks outputs
+        # Process all remaining task outputs BEFORE notifying exit condition
+        # This ensures taskOutputQueueSize is updated before wait() can check it,
+        # preventing a race where wait() sees both unfinishedJobs==0 and
+        # taskOutputQueueSize==0 before callbacks have been queued.
         await asyncThreadcheck("End of job processing", job.done_handler)
-        # Decrement the number of unfinished jobs and notify
-        self.xp.unfinishedJobs -= 1
+        # Now notify - wait() will see the correct taskOutputQueueSize
         async with self.exitCondition:
-            logging.debug("Updated number of unfinished jobs")
             self.exitCondition.notify_all()
-        job.endtime = time.time()
-        if job in self.waitingjobs:
-            self.waitingjobs.remove(job)
-        with job.dependents as dependents:
-            logger.info("Processing %d dependent jobs", len(dependents))
-            for dependency in dependents:
-                logger.debug("Checking dependency %s", dependency)
-                self.loop.call_soon(dependency.check)
         return job.state
-    async def aio_start(self, job: Job) -> Optional[JobState]:
-        """Start a job (scheduler coordination layer)
+    async def aio_start(self, job: Job) -> Optional[JobState]:  # noqa: C901
+        """Start a job with full job starting logic
-        This method serves as a coordination layer that delegates the actual
-        job starting logic to the job itself while handling scheduler-specific
-        concerns like state notifications and providing coordination context.
+        This method handles job locking, dependency acquisition, directory setup,
+        and job execution while using the scheduler's coordination lock to prevent
+        race conditions between multiple jobs.
         :param job: The job to start
         :return: JobState.WAITING if dependencies could not be locked, JobState.DONE
             if job completed successfully, JobState.ERROR if job failed during execution,
             or None (should not occur in normal operation)
-        :raises Exception: Various exceptions during scheduler coordination
+        :raises Exception: Various exceptions during job execution, dependency locking,
+            or process creation
         """
+        from experimaestro.scheduler.jobs import JobStateError
+        from experimaestro.locking import Locks, LockError
+        from experimaestro.scheduler.jobs import JobFailureStatus
         # Assert preconditions
         assert job.launcher is not None
-        try:
-            # Call job's start method with scheduler context
-            state = await job.aio_start(
-                sched_dependency_lock=self.dependencyLock,
-                notification_server=self.xp.server if self.xp else None,
+        # Restart loop for resumable tasks that timeout
+        while True:
+            logger.debug(
+                "Starting job %s with %d dependencies",
+                job,
+                len(job.dependencies),
             )
-            if state is None:
-                # Dependencies couldn't be locked, return WAITING state
-                return JobState.WAITING
+            # Separate static and dynamic dependencies
+            static_deps = [d for d in job.dependencies if not d.is_dynamic()]
+            dynamic_deps = [d for d in job.dependencies if d.is_dynamic()]
-            # Notify scheduler listeners of job state after successful start
+            # First, wait for all static dependencies (jobs) to complete
+            # These don't need the dependency lock as they can't change state
+            # Static dependency locks don't need to be added to locks list
+            logger.debug("Waiting for %d static dependencies", len(static_deps))
+            for dependency in static_deps:
+                logger.debug("Waiting for static dependency %s", dependency)
+                try:
+                    await dependency.aio_lock()
+                except RuntimeError as e:
+                    # Dependency failed - mark job as failed due to dependency
+                    logger.warning("Dependency failed: %s", e)
+                    return JobStateError(JobFailureStatus.DEPENDENCY)
+            # We first lock the job before proceeding
+            with Locks() as locks:
+                logger.debug("[starting] Locking job %s", job)
+                async with job.launcher.connector.lock(job.lockpath):
+                    logger.debug("[starting] Locked job %s", job)
+                    state = None
+                    try:
+                        # Now handle dynamic dependencies (tokens) with retry logic
+                        # CRITICAL: Only one task at a time can acquire dynamic dependencies
+                        # to prevent deadlocks (e.g., Task A holds Token1 waiting for Token2,
+                        # Task B holds Token2 waiting for Token1)
+                        if dynamic_deps:
+                            async with self.dependencyLock:
+                                logger.debug(
+                                    "Locking %d dynamic dependencies (tokens)",
+                                    len(dynamic_deps),
+                                )
+                                while True:
+                                    all_locked = True
+                                    for idx, dependency in enumerate(dynamic_deps):
+                                        try:
+                                            # Use timeout=0 for first dependency, 0.1s for subsequent
+                                            timeout = 0 if idx == 0 else 0.1
+                                            # Acquire the lock (this might block on IPC locks)
+                                            lock = await dependency.aio_lock(
+                                                timeout=timeout
+                                            )
+                                            locks.append(lock)
+                                        except LockError:
+                                            logger.info(
+                                                "Could not lock %s, retrying",
+                                                dependency,
+                                            )
+                                            # Release all locks and restart
+                                            for lock in locks.locks:
+                                                lock.release()
+                                            locks.locks.clear()
+                                            # Put failed dependency first
+                                            dynamic_deps.remove(dependency)
+                                            dynamic_deps.insert(0, dependency)
+                                            all_locked = False
+                                            break
+                                    if all_locked:
+                                        # All locks acquired successfully
+                                        break
+                        # Dependencies have been locked, we can start the job
+                        job.starttime = time.time()
+                        # Creates the main directory
+                        directory = job.path
+                        logger.debug("Making directories job %s...", directory)
+                        # Warn about directory cleanup for non-resumable tasks
+                        # (only once per task type)
+                        xpmtype = job.config.__xpmtype__
+                        if (
+                            directory.is_dir()
+                            and not job.resumable
+                            and not xpmtype.warned_clean_not_resumable
+                        ):
+                            xpmtype.warned_clean_not_resumable = True
+                            logger.warning(
+                                "In a future version, directory will be cleaned up for "
+                                "non-resumable tasks (%s). Use ResumableTask if you want "
+                                "to preserve the directory contents.",
+                                xpmtype.identifier,
+                            )
+                        if not directory.is_dir():
+                            directory.mkdir(parents=True, exist_ok=True)
+                        # Write metadata with submit and start time (after directory creation)
+                        job.write_metadata()
+                        # Sets up the notification URL
+                        if self.server is not None:
+                            job.add_notification_server(self.server)
+                    except Exception:
+                        logger.warning("Error while locking job", exc_info=True)
+                        return JobState.WAITING
+                    try:
+                        # Runs the job
+                        process = await job.aio_run()
+                    except Exception:
+                        logger.warning("Error while starting job", exc_info=True)
+                        return JobState.ERROR
+                # Wait for job to complete while holding locks
+                try:
+                    logger.debug("Waiting for job %s process to end", job)
+                    code = await process.aio_code()
+                    logger.debug("Got return code %s for %s", code, job)
+                    # Record exit code if available
+                    if code is not None:
+                        logger.info("Job %s ended with code %s", job, code)
+                        job.exit_code = code
+                    else:
+                        logger.info("Job %s ended, reading state from files", job)
+                    # Read state from .done/.failed files (contains detailed failure reason)
+                    state = JobState.from_path(job.path, job.name)
+                    # If state is a generic FAILED error, let the process determine
+                    # the state (it may detect launcher-specific failures like SLURM timeout)
+                    if (
+                        state is not None
+                        and isinstance(state, JobStateError)
+                        and state.failure_reason == JobFailureStatus.FAILED
+                        and code is not None
+                    ):
+                        process_state = process.get_job_state(code)
+                        if (
+                            isinstance(process_state, JobStateError)
+                            and process_state.failure_reason != JobFailureStatus.FAILED
+                        ):
+                            # Process detected a more specific failure reason
+                            state = process_state
+                    if state is None:
+                        if code is not None:
+                            # Fall back to process-specific state detection
+                            state = process.get_job_state(code)
+                        else:
+                            logger.error(
+                                "No .done or .failed file found for job %s", job
+                            )
+                            state = JobState.ERROR
+                except JobError:
+                    logger.warning("Error while running job")
+                    state = JobState.ERROR
+                except Exception:
+                    logger.warning(
+                        "Error while running job (in experimaestro)", exc_info=True
+                    )
+                    state = JobState.ERROR
+            # Locks are released here after job completes
+            # Check if we should restart a resumable task that timed out
+            from experimaestro.scheduler.jobs import JobStateError
+            if (
+                isinstance(state, JobStateError)
+                and state.failure_reason == JobFailureStatus.TIMEOUT
+                and job.resumable
+            ):
+                job.retry_count += 1
+                if job.retry_count <= job.max_retries:
+                    logger.info(
+                        "Resumable task %s timed out - restarting (attempt %d/%d)",
+                        job,
+                        job.retry_count,
+                        job.max_retries,
+                    )
+                    # Rotate log files to preserve previous run's logs
+                    job.rotate_logs()
+                    # Clear cached process so aio_run() will create a new one
+                    job._process = None
+                    # Delete PID file so the job will be resubmitted
+                    if job.pidpath.exists():
+                        job.pidpath.unlink()
+                    # Continue the loop to restart
+                    continue
+                else:
+                    logger.warning(
+                        "Resumable task %s exceeded max retries (%d), marking as failed",
+                        job,
+                        job.max_retries,
+                    )
+                    # Fall through to return the error state
+            # Job finished (success or non-recoverable error)
+            # Notify scheduler listeners of job state after job completes
             self.notify_job_state(job)
             return state
-        except Exception:
-            logger.warning("Error in scheduler job coordination", exc_info=True)
-            return JobState.ERROR

experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

Potentially problematic release.

experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl