PyPI - experimaestro - Versions diffs - 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl - Mend

experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of experimaestro might be problematic. Click here for more details.

Files changed (122) hide show

experimaestro/__init__.py +10 -11
experimaestro/annotations.py +167 -206
experimaestro/cli/__init__.py +278 -7
experimaestro/cli/filter.py +42 -74
experimaestro/cli/jobs.py +157 -106
experimaestro/cli/refactor.py +249 -0
experimaestro/click.py +0 -1
experimaestro/commandline.py +19 -3
experimaestro/connectors/__init__.py +20 -1
experimaestro/connectors/local.py +12 -0
experimaestro/core/arguments.py +182 -46
experimaestro/core/identifier.py +107 -6
experimaestro/core/objects/__init__.py +6 -0
experimaestro/core/objects/config.py +542 -25
experimaestro/core/objects/config_walk.py +20 -0
experimaestro/core/serialization.py +91 -34
experimaestro/core/subparameters.py +164 -0
experimaestro/core/types.py +175 -38
experimaestro/exceptions.py +26 -0
experimaestro/experiments/cli.py +111 -25
experimaestro/generators.py +50 -9
experimaestro/huggingface.py +3 -1
experimaestro/launcherfinder/parser.py +29 -0
experimaestro/launchers/__init__.py +26 -1
experimaestro/launchers/direct.py +12 -0
experimaestro/launchers/slurm/base.py +154 -2
experimaestro/mkdocs/metaloader.py +0 -1
experimaestro/mypy.py +452 -7
experimaestro/notifications.py +63 -13
experimaestro/progress.py +0 -2
experimaestro/rpyc.py +0 -1
experimaestro/run.py +19 -6
experimaestro/scheduler/base.py +510 -125
experimaestro/scheduler/dependencies.py +43 -28
experimaestro/scheduler/dynamic_outputs.py +259 -130
experimaestro/scheduler/experiment.py +256 -31
experimaestro/scheduler/interfaces.py +501 -0
experimaestro/scheduler/jobs.py +216 -206
experimaestro/scheduler/remote/__init__.py +31 -0
experimaestro/scheduler/remote/client.py +874 -0
experimaestro/scheduler/remote/protocol.py +467 -0
experimaestro/scheduler/remote/server.py +423 -0
experimaestro/scheduler/remote/sync.py +144 -0
experimaestro/scheduler/services.py +323 -23
experimaestro/scheduler/state_db.py +437 -0
experimaestro/scheduler/state_provider.py +2766 -0
experimaestro/scheduler/state_sync.py +891 -0
experimaestro/scheduler/workspace.py +52 -10
experimaestro/scriptbuilder.py +7 -0
experimaestro/server/__init__.py +147 -57
experimaestro/server/data/index.css +0 -125
experimaestro/server/data/index.css.map +1 -1
experimaestro/server/data/index.js +194 -58
experimaestro/server/data/index.js.map +1 -1
experimaestro/settings.py +44 -5
experimaestro/sphinx/__init__.py +3 -3
experimaestro/taskglobals.py +20 -0
experimaestro/tests/conftest.py +80 -0
experimaestro/tests/core/test_generics.py +2 -2
experimaestro/tests/identifier_stability.json +45 -0
experimaestro/tests/launchers/bin/sacct +6 -2
experimaestro/tests/launchers/bin/sbatch +4 -2
experimaestro/tests/launchers/test_slurm.py +80 -0
experimaestro/tests/tasks/test_dynamic.py +231 -0
experimaestro/tests/test_cli_jobs.py +615 -0
experimaestro/tests/test_deprecated.py +630 -0
experimaestro/tests/test_environment.py +200 -0
experimaestro/tests/test_file_progress_integration.py +1 -1
experimaestro/tests/test_forward.py +3 -3
experimaestro/tests/test_identifier.py +372 -41
experimaestro/tests/test_identifier_stability.py +458 -0
experimaestro/tests/test_instance.py +3 -3
experimaestro/tests/test_multitoken.py +442 -0
experimaestro/tests/test_mypy.py +433 -0
experimaestro/tests/test_objects.py +312 -5
experimaestro/tests/test_outputs.py +2 -2
experimaestro/tests/test_param.py +8 -12
experimaestro/tests/test_partial_paths.py +231 -0
experimaestro/tests/test_progress.py +0 -48
experimaestro/tests/test_remote_state.py +671 -0
experimaestro/tests/test_resumable_task.py +480 -0
experimaestro/tests/test_serializers.py +141 -1
experimaestro/tests/test_state_db.py +434 -0
experimaestro/tests/test_subparameters.py +160 -0
experimaestro/tests/test_tags.py +136 -0
experimaestro/tests/test_tasks.py +107 -121
experimaestro/tests/test_token_locking.py +252 -0
experimaestro/tests/test_tokens.py +17 -13
experimaestro/tests/test_types.py +123 -1
experimaestro/tests/test_workspace_triggers.py +158 -0
experimaestro/tests/token_reschedule.py +4 -2
experimaestro/tests/utils.py +2 -2
experimaestro/tokens.py +154 -57
experimaestro/tools/diff.py +1 -1
experimaestro/tui/__init__.py +8 -0
experimaestro/tui/app.py +2395 -0
experimaestro/tui/app.tcss +353 -0
experimaestro/tui/log_viewer.py +228 -0
experimaestro/utils/__init__.py +23 -0
experimaestro/utils/environment.py +148 -0
experimaestro/utils/git.py +129 -0
experimaestro/utils/resources.py +1 -1
experimaestro/version.py +34 -0
{experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
experimaestro-2.0.0b8.dist-info/RECORD +187 -0
{experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
experimaestro/compat.py +0 -6
experimaestro/core/objects.pyi +0 -221
experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
experimaestro-2.0.0a8.dist-info/RECORD +0 -166
experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
{experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0

experimaestro/scheduler/dependencies.py CHANGED Viewed

@@ -1,15 +1,11 @@
 """Dependency between tasks and tokens"""
 import threading
-from typing import Optional, Set, TYPE_CHECKING
-import asyncio
+from typing import Set
+from abc import ABC, abstractmethod
 from enum import Enum
-from ..utils import logger
 from ..locking import Lock
-if TYPE_CHECKING:
-    from . import Job
 class Dependents:
     """Encapsulate the access to the dependents"""
@@ -47,32 +43,51 @@ class DependencyStatus(Enum):
     """Dependency won't be availabe in the foreseeable future"""
-class Dependency:
-    # Dependency status
-    loop: asyncio.AbstractEventLoop
+class Dependency(ABC):
+    """Base class for dependencies
+    Static dependencies (like jobs) have a fixed state once resolved - they cannot
+    go from DONE back to WAIT. This is the default behavior.
+    """
     def __init__(self, origin):
-        # Origin and target are two resources
+        # Origin is the resource this dependency points to
         self.origin = origin
-        self.target: Optional["Job"] = None
-        self.currentstatus = DependencyStatus.WAIT
+        # Target will be set by scheduler when registering the job
+        self.target = None
+    def is_dynamic(self) -> bool:
+        """Returns True if this is a dynamic dependency (can change state)"""
+        return False
+    @abstractmethod
+    async def aio_lock(self, timeout: float = 0) -> Lock:
+        """Acquire a lock on this dependency asynchronously
+        Args:
+            timeout: Timeout in seconds (0 = wait indefinitely)
-    def status(self) -> DependencyStatus:
-        raise NotImplementedError()
+        Returns:
+            Lock object
-    def lock(self) -> Lock:
-        raise NotImplementedError()
+        Raises:
+            LockError: If lock cannot be acquired within timeout
+            RuntimeError: If dependency failed
+        """
+        pass
     def __repr__(self) -> str:
-        return "Dep[{origin}->{target}]/{currentstatus}".format(**self.__dict__)
-    def check(self):
-        assert self.target is not None
-        status = self.status()
-        logger.debug("Dependency check: %s", self)
-        if status != self.currentstatus:
-            logger.debug(
-                "Dependency %s is %s (was: %s)", self, status, self.currentstatus
-            )
-            self.target.dependencychanged(self, self.currentstatus, status)
-            self.currentstatus = status
+        return f"Dep[{self.origin}]"
+class DynamicDependency(Dependency):
+    """Base class for dynamic dependencies
+    Dynamic dependencies (like tokens) can change state at any time - availability
+    can go from OK to WAIT and back. These require special handling during lock
+    acquisition with retry logic.
+    """
+    def is_dynamic(self) -> bool:
+        """Returns True - this is a dynamic dependency"""
+        return True

experimaestro/scheduler/dynamic_outputs.py CHANGED Viewed

@@ -1,184 +1,313 @@
-"""Handles dynamic task outputs"""
+"""Handles dynamic task outputs
+This module provides support for tasks that produce dynamic outputs during
+execution. These outputs can trigger callbacks that submit new tasks.
+Key concepts:
+- TaskOutputs: Monitors a task's output file for events
+- TaskOutputsWorker: Processes events and calls registered callbacks
+"""
 import asyncio
 import json
-import logging
 import queue
 import threading
-from collections import defaultdict
-from functools import cached_property
 from pathlib import Path
-from typing import Callable, TYPE_CHECKING
+from typing import Callable, Dict, List, Set, TYPE_CHECKING
 from watchdog.events import FileSystemEventHandler
 from experimaestro.ipc import ipcom
 from experimaestro.utils import logger
-from .base import Job, experiment
 if TYPE_CHECKING:
     from experimaestro.core.objects import WatchedOutput
-class TaskOutputCallbackHandler:
-    def __init__(self, converter: Callable):
-        pass
+    from experimaestro.scheduler.experiment import experiment
+class TaskOutputWatcher:
+    """Watches a specific output method for a configuration within a job"""
+    def __init__(
+        self,
+        key: str,
+        method: Callable,
+        worker: "TaskOutputsWorker",
+    ):
+        self.key = key
+        self.method = method
+        self.worker = worker
+        self.callbacks: Set[Callable] = set()
+        self.processed_events: List[dict] = []
+    def add_callback(self, callback: Callable):
+        """Add a callback and replay any existing events"""
+        # Replay processed events to new callback (don't update count for replays)
+        for event in self.processed_events:
+            self.worker.add(callback, event, update_count=False)
+        self.callbacks.add(callback)
+    def process_event(self, raw_event: dict):
+        """Process a raw event from the task output file"""
+        # Call the method to convert the raw event to a configuration
+        try:
+            # The method signature is: method(dep, *args, **kwargs) -> Config
+            # We need to provide a marker function that marks the output
+            def mark_output(config):
+                """Marker function that just returns the config"""
+                return config
+            result = self.method(mark_output, *raw_event["args"], **raw_event["kwargs"])
+            self.processed_events.append(result)
+            # Dispatch to all callbacks
+            for callback in self.callbacks:
+                self.worker.add(callback, result)
+        except Exception:
+            logger.exception("Error processing task output event")
 class TaskOutputs(FileSystemEventHandler):
-    """Represent and monitors dynamic outputs generated by one task"""
+    """Monitors dynamic outputs generated by one task"""
-    #: Global dictionary for handles
-    HANDLERS: dict[Path, "TaskOutputs"] = {}
+    #: Global dictionary mapping paths to TaskOutputs instances
+    HANDLERS: Dict[Path, "TaskOutputs"] = {}
-    #: Global lock to access current HANDLERS
+    #: Global lock for accessing HANDLERS
     LOCK = threading.Lock()
-    def create(job: Job):
+    @staticmethod
+    def get_or_create(path: Path, worker: "TaskOutputsWorker") -> "TaskOutputs":
+        """Get or create a TaskOutputs instance for the given path"""
         with TaskOutputs.LOCK:
-            if instance := TaskOutputs.get(job.task_outputs_path, None):
+            if path in TaskOutputs.HANDLERS:
+                instance = TaskOutputs.HANDLERS[path]
+                # Update worker reference in case this is a new experiment
+                instance.worker = worker
+                # Clear old watchers - new ones will be added and replay events
+                instance.watchers.clear()
                 return instance
-            instance = TaskOutputs(job.task_outputs_path)
-            TaskOutputs[job.task_outputs_path] = instance
+            instance = TaskOutputs(path, worker)
+            TaskOutputs.HANDLERS[path] = instance
             return instance
-    def __init__(self, path: Path):
-        """Monitors an event path"""
-        logger.debug("Watching dynamic task outputs in %s", path)
+    def __init__(self, path: Path, worker: "TaskOutputsWorker"):
+        """Initialize monitoring for a task output path"""
+        super().__init__()
+        logger.debug("Creating TaskOutputs monitor for %s", path)
         self.path = path
-        self.handle = None
-        self.count = 0
-        self.lock = threading.Lock()
-        self.listeners: dict[str, dict[Callable, set[Callable]]] = defaultdict(
-            lambda: defaultdict(set)
-        )
-        #: The events registered so far
-        self.events = []
-    def __enter__(self):
-        """Starts monitoring task outputs"""
-        self.job.task_outputs_path.parent.mkdir(parents=True, exist_ok=True)
-        with self.lock:
-            if self.handle is None:
-                assert self.count == 0
-                self.handle = ipcom().fswatch(self, self.path.parent, False)
-            self.count += 1
-        return self
-    def __exit__(self, *args):
-        """Stops monitoring task outputs"""
-        with self.lock:
-            self.count -= 1
-            if self.count == 0:
-                ipcom().fsunwatch(self.handle)
-                self.fh.close()
-                self.handle = None
-                self._fh = None
-    def watch_output(self, watched: "WatchedOutput"):
-        """Add a new listener"""
-        key = f"{watched.config.__identifier__}/{watched.method_name}"
-        with self.lock:
-            # Process events so far
-            listener = self.listeners[key].get(watched.method, None)
-            if listener is None:
-                listener = TaskOutputCallbackHandler(watched.method)
-            # Register
-            self.listeners[key][watched.method].add(watched.callback)
-    #
-    # --- Events
-    #
-    @cached_property
-    def fh(self):
-        if self._fh is None:
-            self._fh = self.path.open("rt")
-        return self._fh
+        self.worker = worker
+        self._watch_handle = None
+        self._file_handle = None
+        self._lock = threading.Lock()
+        # Map from key (config_id/method_name) to TaskOutputWatcher
+        self.watchers: Dict[str, TaskOutputWatcher] = {}
+    def start_watching(self):
+        """Start watching the task output file"""
+        logger.debug("Starting to watch task outputs at %s", self.path)
+        with self._lock:
+            if self._watch_handle is not None:
+                return  # Already watching
+            # Ensure the directory exists
+            self.path.parent.mkdir(parents=True, exist_ok=True)
+            # Start file system watching
+            self._watch_handle = ipcom().fswatch(self, self.path.parent, False)
+            logger.debug("Started watching directory %s", self.path.parent)
+            # Process any existing content
+            self._process_file()
+    def stop_watching(self):
+        """Stop watching the task output file"""
+        with self._lock:
+            if self._watch_handle is not None:
+                try:
+                    ipcom().fsunwatch(self._watch_handle)
+                except KeyError:
+                    pass  # Already unwatched
+                self._watch_handle = None
+            if self._file_handle is not None:
+                self._file_handle.close()
+                self._file_handle = None
+    def add_watcher(self, watched: "WatchedOutput"):
+        """Add a watcher for a specific output method"""
+        # Use the identifier from the config - watched.config is actually a Config object
+        # (method.__self__), not a ConfigInformation, despite the type annotation
+        config_id = watched.config.__xpm__.identifier.all.hex()
+        key = f"{config_id}/{watched.method_name}"
+        logger.debug("Adding watcher for key: %s", key)
+        with self._lock:
+            is_new = key not in self.watchers
+            if is_new:
+                self.watchers[key] = TaskOutputWatcher(key, watched.method, self.worker)
+            # If this is a new watcher and the file already exists, replay events from file
+            if is_new and self.path.exists():
+                self._replay_events_for_key(key)
+            self.watchers[key].add_callback(watched.callback)
+    def _replay_events_for_key(self, key: str):
+        """Replay events from the file for a specific key"""
+        if not self.path.exists():
+            return
-    def on_modified(self, event):
-        self.handle(Path(event.src_path))
+        with self.path.open("rt") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
-    def on_created(self, event):
-        self.handle(Path(event.src_path))
+                try:
+                    event = json.loads(line)
+                    if event.get("key") == key:
+                        self.watchers[key].process_event(event)
+                except json.JSONDecodeError:
+                    logger.warning("Invalid JSON in task output: %s", line)
+                except Exception:
+                    logger.exception("Error processing task output line")
-    def handle(self, path: Path):
-        if path != self.path:
+    def _process_file(self):
+        """Process the task output file"""
+        if not self.path.exists():
             return
-        with self.lock:
-            logger.debug("[TASK OUTPUT] Handling task output for %s", self.path)
-            while json_line := self.fh.readline():
-                # Read the event
-                event = json.loads(json_line)
-                logger.debug("Event: %s", event)
-                # FIXME: move elsewhere
-                # # Process the event
-                # event = self.config_method(
-                #     self.job.config.__xpm__.mark_output,
-                #     *event["args"],
-                #     **event["kwargs"],
-                # )
+        if self._file_handle is None:
+            self._file_handle = self.path.open("rt")
+        while line := self._file_handle.readline():
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                event = json.loads(line)
+                key = event.get("key")
+                if key and key in self.watchers:
+                    self.watchers[key].process_event(event)
+            except json.JSONDecodeError:
+                logger.warning("Invalid JSON in task output: %s", line)
+            except Exception:
+                logger.exception("Error processing task output line")
+    # FileSystemEventHandler methods
+    def on_modified(self, event):
+        if Path(event.src_path) == self.path:
+            with self._lock:
+                self._process_file()
-                self.events.append(event)
-                # self.job.scheduler.xp.taskOutputsWorker.add(self, event)
+    def on_created(self, event):
+        if Path(event.src_path) == self.path:
+            with self._lock:
+                self._process_file()
 class TaskOutputsWorker(threading.Thread):
-    """This worker process dynamic output queue for one experiment"""
+    """Worker thread that processes task output callbacks"""
-    def __init__(self, xp: experiment):
-        super().__init__(name="task outputs worker", daemon=True)
-        self.queue = queue.Queue()
+    def __init__(self, xp: "experiment"):
+        super().__init__(name="task-outputs-worker", daemon=True)
+        self.queue: queue.Queue = queue.Queue()
         self.xp = xp
+        self._monitors: Dict[Path, TaskOutputs] = {}
+        self._lock = threading.Lock()
     def watch_output(self, watched: "WatchedOutput"):
-        """Watch an output
+        """Register a watched output
         :param watched: The watched output specification
         """
-        logger.debug("Registering task output listener %s", watched)
+        # Get the job's task output path
+        job = watched.job
+        if job is None:
+            logger.warning("Cannot watch output without job: %s", watched)
+            return
+        path = job.task_outputs_path
+        logger.debug("Registering task output listener at %s", path)
+        with self._lock:
+            if path not in self._monitors:
+                monitor = TaskOutputs.get_or_create(path, self)
+                self._monitors[path] = monitor
+                monitor.start_watching()
+            else:
+                monitor = self._monitors[path]
+        monitor.add_watcher(watched)
+    def add(self, callback: Callable, event, update_count: bool = True):
+        """Add an event to the processing queue
-        # path = watched.job.tasks_output_path
-        TaskOutputs.create(watched.job).watch_output(watched)
+        :param callback: The callback to call with the event
+        :param event: The event data
+        :param update_count: Whether to update the task output count (False for replays)
+        """
+        if update_count:
+            asyncio.run_coroutine_threadsafe(
+                self.xp.update_task_output_count(1),
+                self.xp.scheduler.loop,
+            ).result()
-    def add(self, watcher, event):
-        asyncio.run_coroutine_threadsafe(
-            self.xp.update_task_output_count(1),
-            self.xp.scheduler.loop,
-        ).result()
-        self.queue.put((watcher, event))
+        self.queue.put((callback, event, update_count))
     def run(self):
-        logging.debug("Starting output listener queue")
+        """Main worker loop"""
+        logger.debug("Starting task outputs worker")
         while True:
-            # Get the next element in the queue
             element = self.queue.get()
             if element is None:
-                # end of processing
+                # Shutdown signal
                 break
-            # Call all the listeners
-            logging.debug("Got one event: %s", element)
-            watcher, event = element
-            for listener in watcher.listeners:
-                try:
-                    logger.debug("Calling listener [%s] with %s", listener, event)
-                    listener(event)
-                    logger.debug(
-                        "[done] Calling listener [%s] with %s", listener, event
-                    )
-                except Exception:
-                    logging.exception("Exception while calling the listener")
+            callback, event, update_count = element
+            try:
+                logger.debug("Calling callback %s with event %s", callback, event)
+                callback(event)
+            except Exception:
+                logger.exception("Error in task output callback")
+            finally:
                 self.queue.task_done()
+                if update_count:
+                    asyncio.run_coroutine_threadsafe(
+                        self.xp.update_task_output_count(-1),
+                        self.xp.scheduler.loop,
+                    ).result()
+        logger.debug("Task outputs worker stopped")
-                asyncio.run_coroutine_threadsafe(
-                    self.xp.update_task_output_count(-1), self.xp.scheduler.loop
-                ).result()
+    def process_job_outputs(self, job) -> None:
+        """Explicitly process any remaining task outputs for a completed job.
+        This is called when a job finishes to ensure all task outputs written
+        by the job are processed before the experiment considers exiting.
+        This is necessary because file system watchers may have latency.
+        :param job: The job that has finished
+        """
+        path = job.task_outputs_path
+        with self._lock:
+            monitor = self._monitors.get(path)
+        if monitor is not None:
+            with monitor._lock:
+                monitor._process_file()
+    def shutdown(self):
+        """Stop the worker and all monitors"""
+        # Stop all monitors
+        with self._lock:
+            for monitor in self._monitors.values():
+                monitor.stop_watching()
+            self._monitors.clear()
+        # Signal the worker to stop
+        self.queue.put(None)

experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

Potentially problematic release.

experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl