experimaestro-2.0.0a8-py3-none-any.whl → experimaestro-2.0.0b8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of experimaestro has been flagged as potentially problematic.
Files changed (122)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +278 -7
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/refactor.py +249 -0
  7. experimaestro/click.py +0 -1
  8. experimaestro/commandline.py +19 -3
  9. experimaestro/connectors/__init__.py +20 -1
  10. experimaestro/connectors/local.py +12 -0
  11. experimaestro/core/arguments.py +182 -46
  12. experimaestro/core/identifier.py +107 -6
  13. experimaestro/core/objects/__init__.py +6 -0
  14. experimaestro/core/objects/config.py +542 -25
  15. experimaestro/core/objects/config_walk.py +20 -0
  16. experimaestro/core/serialization.py +91 -34
  17. experimaestro/core/subparameters.py +164 -0
  18. experimaestro/core/types.py +175 -38
  19. experimaestro/exceptions.py +26 -0
  20. experimaestro/experiments/cli.py +111 -25
  21. experimaestro/generators.py +50 -9
  22. experimaestro/huggingface.py +3 -1
  23. experimaestro/launcherfinder/parser.py +29 -0
  24. experimaestro/launchers/__init__.py +26 -1
  25. experimaestro/launchers/direct.py +12 -0
  26. experimaestro/launchers/slurm/base.py +154 -2
  27. experimaestro/mkdocs/metaloader.py +0 -1
  28. experimaestro/mypy.py +452 -7
  29. experimaestro/notifications.py +63 -13
  30. experimaestro/progress.py +0 -2
  31. experimaestro/rpyc.py +0 -1
  32. experimaestro/run.py +19 -6
  33. experimaestro/scheduler/base.py +510 -125
  34. experimaestro/scheduler/dependencies.py +43 -28
  35. experimaestro/scheduler/dynamic_outputs.py +259 -130
  36. experimaestro/scheduler/experiment.py +256 -31
  37. experimaestro/scheduler/interfaces.py +501 -0
  38. experimaestro/scheduler/jobs.py +216 -206
  39. experimaestro/scheduler/remote/__init__.py +31 -0
  40. experimaestro/scheduler/remote/client.py +874 -0
  41. experimaestro/scheduler/remote/protocol.py +467 -0
  42. experimaestro/scheduler/remote/server.py +423 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +323 -23
  45. experimaestro/scheduler/state_db.py +437 -0
  46. experimaestro/scheduler/state_provider.py +2766 -0
  47. experimaestro/scheduler/state_sync.py +891 -0
  48. experimaestro/scheduler/workspace.py +52 -10
  49. experimaestro/scriptbuilder.py +7 -0
  50. experimaestro/server/__init__.py +147 -57
  51. experimaestro/server/data/index.css +0 -125
  52. experimaestro/server/data/index.css.map +1 -1
  53. experimaestro/server/data/index.js +194 -58
  54. experimaestro/server/data/index.js.map +1 -1
  55. experimaestro/settings.py +44 -5
  56. experimaestro/sphinx/__init__.py +3 -3
  57. experimaestro/taskglobals.py +20 -0
  58. experimaestro/tests/conftest.py +80 -0
  59. experimaestro/tests/core/test_generics.py +2 -2
  60. experimaestro/tests/identifier_stability.json +45 -0
  61. experimaestro/tests/launchers/bin/sacct +6 -2
  62. experimaestro/tests/launchers/bin/sbatch +4 -2
  63. experimaestro/tests/launchers/test_slurm.py +80 -0
  64. experimaestro/tests/tasks/test_dynamic.py +231 -0
  65. experimaestro/tests/test_cli_jobs.py +615 -0
  66. experimaestro/tests/test_deprecated.py +630 -0
  67. experimaestro/tests/test_environment.py +200 -0
  68. experimaestro/tests/test_file_progress_integration.py +1 -1
  69. experimaestro/tests/test_forward.py +3 -3
  70. experimaestro/tests/test_identifier.py +372 -41
  71. experimaestro/tests/test_identifier_stability.py +458 -0
  72. experimaestro/tests/test_instance.py +3 -3
  73. experimaestro/tests/test_multitoken.py +442 -0
  74. experimaestro/tests/test_mypy.py +433 -0
  75. experimaestro/tests/test_objects.py +312 -5
  76. experimaestro/tests/test_outputs.py +2 -2
  77. experimaestro/tests/test_param.py +8 -12
  78. experimaestro/tests/test_partial_paths.py +231 -0
  79. experimaestro/tests/test_progress.py +0 -48
  80. experimaestro/tests/test_remote_state.py +671 -0
  81. experimaestro/tests/test_resumable_task.py +480 -0
  82. experimaestro/tests/test_serializers.py +141 -1
  83. experimaestro/tests/test_state_db.py +434 -0
  84. experimaestro/tests/test_subparameters.py +160 -0
  85. experimaestro/tests/test_tags.py +136 -0
  86. experimaestro/tests/test_tasks.py +107 -121
  87. experimaestro/tests/test_token_locking.py +252 -0
  88. experimaestro/tests/test_tokens.py +17 -13
  89. experimaestro/tests/test_types.py +123 -1
  90. experimaestro/tests/test_workspace_triggers.py +158 -0
  91. experimaestro/tests/token_reschedule.py +4 -2
  92. experimaestro/tests/utils.py +2 -2
  93. experimaestro/tokens.py +154 -57
  94. experimaestro/tools/diff.py +1 -1
  95. experimaestro/tui/__init__.py +8 -0
  96. experimaestro/tui/app.py +2395 -0
  97. experimaestro/tui/app.tcss +353 -0
  98. experimaestro/tui/log_viewer.py +228 -0
  99. experimaestro/utils/__init__.py +23 -0
  100. experimaestro/utils/environment.py +148 -0
  101. experimaestro/utils/git.py +129 -0
  102. experimaestro/utils/resources.py +1 -1
  103. experimaestro/version.py +34 -0
  104. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
  105. experimaestro-2.0.0b8.dist-info/RECORD +187 -0
  106. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
  107. experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
  108. experimaestro/compat.py +0 -6
  109. experimaestro/core/objects.pyi +0 -221
  110. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  111. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  112. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  113. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  114. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  115. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  116. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  117. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  118. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  119. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  120. experimaestro-2.0.0a8.dist-info/RECORD +0 -166
  121. experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
  122. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
experimaestro/settings.py CHANGED
@@ -5,6 +5,7 @@ from functools import lru_cache
  from pathlib import Path
  from typing import Dict, Optional, List
  import logging
+ import fnmatch


  @dataclass
@@ -38,6 +39,12 @@ class WorkspaceSettings:
      alt_workspaces: List[str] = field(default_factory=list)
      """Alternative workspaces to find jobs or experiments"""

+     max_retries: int = 3
+     """Maximum number of retries for resumable tasks that timeout (default: 3)"""
+
+     triggers: List[str] = field(default_factory=list)
+     """Glob patterns to automatically select this workspace based on experiment ID"""
+
      def __post_init__(self):
          self.path = self.path.expanduser().resolve()

@@ -84,9 +91,21 @@ def get_workspace(id: Optional[str] = None) -> Optional[WorkspaceSettings]:


  def find_workspace(
-     *, workspace: Optional[str] = None, workdir: Optional[Path] = None
+     *,
+     workspace: Optional[str] = None,
+     workdir: Optional[Path] = None,
+     experiment_id: Optional[str] = None,
  ) -> WorkspaceSettings:
-     """Find workspace"""
+     """Find workspace
+
+     Args:
+         workspace: Explicit workspace ID to use
+         workdir: Explicit working directory path
+         experiment_id: Experiment ID to match against workspace triggers
+
+     Returns:
+         WorkspaceSettings object
+     """
      workdir = Path(workdir) if workdir else None

      if workspace:
@@ -103,8 +122,28 @@ def find_workspace(
          logging.info("Using workdir %s", workdir)
          ws_env = WorkspaceSettings("", workdir)
      else:
-         ws_env = get_workspace()
-         assert ws_env is not None, "No workdir or workspace defined, and no default"
-         logging.info("Using default workspace %s", ws_env.id)
+         # Try to match experiment_id against workspace triggers
+         matched_workspace = None
+         if experiment_id:
+             workspaces = get_settings().workspaces
+             for ws in workspaces:
+                 for trigger in ws.triggers:
+                     if fnmatch.fnmatch(experiment_id, trigger):
+                         matched_workspace = ws
+                         logging.info(
+                             "Auto-selected workspace %s (matched trigger '%s')",
+                             ws.id,
+                             trigger,
+                         )
+                         break
+                 if matched_workspace:
+                     break
+
+         if matched_workspace:
+             ws_env = matched_workspace
+         else:
+             ws_env = get_workspace()
+             assert ws_env is not None, "No workdir or workspace defined, and no default"
+             logging.info("Using default workspace %s", ws_env.id)

      return ws_env
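
The new `triggers` field lets a workspace claim experiments by ID: `find_workspace(experiment_id=...)` picks the first workspace whose glob matches. A minimal sketch of that matching rule (the workspace entries below are illustrative, not the package's settings format):

    import fnmatch

    # Hypothetical workspace entries mirroring WorkspaceSettings(id=..., triggers=[...])
    workspaces = [
        {"id": "nlp", "triggers": ["bert-*", "llm-*"]},
        {"id": "vision", "triggers": ["resnet-*"]},
    ]

    def match_workspace(experiment_id: str):
        # First workspace whose glob matches wins, as in find_workspace above
        for ws in workspaces:
            if any(fnmatch.fnmatch(experiment_id, t) for t in ws["triggers"]):
                return ws["id"]
        return None  # caller falls back to the default workspace

    assert match_workspace("bert-base-finetune") == "nlp"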
experimaestro/sphinx/__init__.py CHANGED
@@ -113,12 +113,12 @@ class ConfigDocumenter(ClassDocumenter):
      @staticmethod
      def formatDefault(value) -> str:
          if isinstance(value, Config):
-             objecttype = value.__xpmtype__.objecttype
+             value_type = value.__xpmtype__.value_type
              params = ", ".join(
                  [f"{key}={value}" for key, value in value.__xpm__.values.items()]
              )
              # It would be possible to do better... if not
-             return f"{objecttype.__module__}.{objecttype.__qualname__}({params})"
+             return f"{value_type.__module__}.{value_type.__qualname__}({params})"

          return str(value)

@@ -176,7 +176,7 @@ class ConfigDocumenter(ClassDocumenter):
          self.add_line(" " + _("Bases: %s") % ", ".join(base_classes), sourcename)

          # Adds return type if different
-         if xpminfo.returntype != xpminfo.objecttype:
+         if xpminfo.returntype != xpminfo.value_type:
              self.add_line("", sourcename)
              self.add_line(
                  " " + _("Submit type: %s") % restify(xpminfo.returntype), sourcename
experimaestro/taskglobals.py CHANGED
@@ -3,6 +3,23 @@ from pathlib import Path
  from typing import Optional


+ class LauncherInformation:
+     """Minimal launcher information available during task execution.
+
+     This is a lightweight class used to query launcher-specific information
+     (like remaining time) during task execution. It's set by the generated
+     Python script that runs the task.
+     """
+
+     def remaining_time(self) -> Optional[float]:
+         """Returns the remaining time in seconds before the job times out.
+
+         Returns:
+             The remaining time in seconds, or None if no time limit.
+         """
+         return None
+
+
  class Env:
      _instance = None

@@ -12,6 +29,9 @@ class Env:
      # The current task path
      taskpath: Optional[Path] = None

+     # Launcher information (only set when running a task)
+     launcher_info: Optional[LauncherInformation] = None
+
      # Set to True when multi-processing when
      # in slave mode:
      # - no progress report
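
Task code can consult `Env.launcher_info` to checkpoint before a scheduled timeout. A minimal sketch, assuming the singleton is reachable via an `Env.instance()` accessor (an assumption here; only `launcher_info` and `remaining_time()` are defined in the diff above):

    from experimaestro import taskglobals

    def maybe_checkpoint(save_state, safety_margin: float = 120.0):
        # Env.instance() is assumed; launcher_info is set only while a task runs
        info = taskglobals.Env.instance().launcher_info
        if info is None:
            return
        remaining = info.remaining_time()
        if remaining is not None and remaining < safety_margin:
            save_state()  # persist progress before the launcher kills the job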
experimaestro/tests/conftest.py CHANGED
@@ -19,6 +19,86 @@ def xpmdirectory(tmp_path_factory):
      shutil.rmtree(workdir)


+ @pytest.fixture(scope="function", autouse=True)
+ def reset_scheduler():
+     """Reset scheduler state between tests to avoid state leakage with singleton pattern"""
+     from experimaestro.scheduler.base import Scheduler
+     from experimaestro.server import Server
+
+     # Get the singleton instance if it exists
+     if Scheduler._instance is not None:
+         scheduler = Scheduler._instance
+         # Clear job registrations but keep scheduler running
+         logging.debug(
+             f"FIXTURE: Clearing scheduler before test - jobs count: {len(scheduler.jobs)}"
+         )
+         # Clear experiment references from all jobs
+         for job in scheduler.jobs.values():
+             job.experiments.clear()
+         scheduler.jobs.clear()
+         scheduler.waitingjobs.clear()
+         scheduler.experiments.clear()
+         # Clear state provider experiment providers to avoid stale references
+         if (
+             hasattr(scheduler, "state_provider")
+             and scheduler.state_provider is not None
+         ):
+             # Close all experiment providers
+             for provider in scheduler.state_provider.experiment_providers.values():
+                 provider.close()
+             scheduler.state_provider.experiment_providers.clear()
+             logging.debug("FIXTURE: Cleared state provider experiment providers")
+
+         # Also clear listeners to prevent stale listeners
+         scheduler.clear_listeners()
+
+         # Re-add state_provider as listener if it exists
+         if (
+             hasattr(scheduler, "state_provider")
+             and scheduler.state_provider is not None
+         ):
+             scheduler.addlistener(scheduler.state_provider)
+
+     # Reset server instance too
+     if Server._instance is not None:
+         logging.debug("FIXTURE: Clearing server instance")
+         Server._instance = None
+
+     yield
+
+     # Cleanup after test - clear again
+     if Scheduler._instance is not None:
+         scheduler = Scheduler._instance
+         logging.debug(
+             f"FIXTURE: Clearing scheduler after test - jobs count: {len(scheduler.jobs)}"
+         )
+         # Clear experiment references from all jobs
+         for job in scheduler.jobs.values():
+             job.experiments.clear()
+         scheduler.jobs.clear()
+         scheduler.waitingjobs.clear()
+         scheduler.experiments.clear()
+         # Clear state provider experiment providers
+         if (
+             hasattr(scheduler, "state_provider")
+             and scheduler.state_provider is not None
+         ):
+             for provider in scheduler.state_provider.experiment_providers.values():
+                 provider.close()
+             scheduler.state_provider.experiment_providers.clear()
+         scheduler.clear_listeners()
+         # Re-add state_provider as listener if it exists
+         if (
+             hasattr(scheduler, "state_provider")
+             and scheduler.state_provider is not None
+         ):
+             scheduler.addlistener(scheduler.state_provider)
+
+     # Reset server after test
+     if Server._instance is not None:
+         Server._instance = None
+
+
  # Sets a flag
  def pytest_configure(config):
      import sys
experimaestro/tests/core/test_generics.py CHANGED
@@ -3,7 +3,7 @@
  from typing import Generic, Optional, TypeVar

  import pytest
- from experimaestro import Config, Param
+ from experimaestro import field, Config, Param
  from experimaestro.core.arguments import Argument
  from experimaestro.core.types import TypeVarType

@@ -162,7 +162,7 @@ class TreeGenericConfig(Config, Generic[T]):
  class TagTreeGenericConfig(TreeGenericConfig[T], Generic[T]):
      """A tagged version of TreeGenericConfig to test recursive generics"""

-     tag: Param[str] = "default"
+     tag: Param[str] = field(ignore_default="default")


  def test_core_generics_recursive():
experimaestro/tests/identifier_stability.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "bool_false": "bb61efa2769d20e6665fd63911d8a1e2fcdd2af22ff1e6c860d2b26ab7b04ab2",
+   "bool_true": "e718f2e3a3cc5b6b816a9645f587d3009efb08642bd22db45c8c288b78ff11f4",
+   "cycle_simple": "a73ef01b1c3e4e0187aee95eda96d1c069fd4757ad0137ac66adbf3a9502673f",
+   "default_override": "90951821af9c0d84b3f300fadfab63387bbfad6d1982dfbaa5b4d7ebbbfcf800",
+   "default_with_default": "0203eb7eb6a13e3c4592c9366f76a5f53dd2c5211c576547873184af86558bc3",
+   "dict_empty": "d2c32c9305431266e4ab1f5a70face4cee13b02a01af4ba0a6046fb254971b5f",
+   "dict_multiple": "01994d7bc212a73ea9d80332bf460922ca786a9d4ab8d8f444b3673901c75c99",
+   "dict_nested_empty": "77ebb66bcfe1c24c166dd80ceaae5840c1729f5c435a9d4ae040e8285b9beca7",
+   "dict_nested_multiple": "4476cd6934c5cc4a63cce1594cbbed622d0e70f6291b6d8cb4092d7b2612bb15",
+   "dict_nested_single": "373a3e409042029439fdbcb679b4e6388242901772d273ddefafd5663f9e57e4",
+   "dict_single": "dd879ad5038694c95134926ab3524696437f6ec96e52341a4e8c8fd44a1c2ae2",
+   "enum_value_a": "96c98d4683658b0a8e62d67abcb32c918506f8e455685680ee50f7b174e91366",
+   "enum_value_b": "433e23f7e2ee01a850bd97da097cf158873cb7465d3faa86218d86c0f7c38834",
+   "enum_value_c": "5881adaa535d2c3c842e4f163fbabbe5f537bc18f48a5bc2e9887ad9b3deb00b",
+   "float_negative": "67a86cea76bc90be4ec052e8c2f08829fb989e64a1d419541e68485dff85dba1",
+   "float_simple": "f445ab15c80965e89436c9e58a369fb969f216f392e4cbb19846830731f9a1e4",
+   "float_zero": "35879908ca1652ea07a4b49f0c44e9aa2ca264bedf14974dd9a765b1fafd1921",
+   "instance_different_values": "e175afb36163f56078beb516cf7489238b491571519e25c2b5ff68afbeccc643",
+   "instance_separate": "6d6274a5b541f60833e5d15a556a1b8adfaaa6dd0970a09a57849edd7a0c6fdd",
+   "instance_shared": "d9a76235da634b81b7559d561322263698852fa2012005781569154d7ad3cfc5",
+   "int_negative": "4e2ad6ee44e1c9429b56900aea7ba81a6b746a38678d2e29a89805bfb32b9153",
+   "int_positive": "2c57a590b8bf1bb5283a54d971814e196f395269f2973096dc277dbc24218962",
+   "int_zero": "2696ea881e0f601d4ad75679560e0e3305fa2f15552952d88ac87df4cc6f9f49",
+   "list_empty": "457140939f4e4ea43c5cfa4e05f4db0ed82891f0b550e23bfedf1510aea94d0c",
+   "list_multiple": "14575fd83be49b8f23d43d54fab90768ea0c296a829eeaa1b5a312f8322fb2ef",
+   "list_nested_empty": "fa30a32619931b4048a9f9854d82975e955c48017cd72b474344fa6b5a9c9bbe",
+   "list_nested_multiple": "4cc6e0d3d4ac32209334b8667d6b18f37cc5fd1677309eaeec89e7862d98ec5f",
+   "list_nested_single": "160a7e361e3482536479beaf8250f3107436c59e60ca5da573548da60e4b9bcf",
+   "list_single": "d33a881039f9a79cb7d688e547acdc79092b86b9e05fcb65faebbebfb38b3067",
+   "nested_multi": "8a1a37250d6f90caa549b02f8899dcba51ce01e5f6f511896b6c00b9c4a714a0",
+   "nested_simple": "a52569bc853a4cacbc044ba3082bbc10670f9174279e283168115828de979be1",
+   "option_override": "e98d969a3a309b2a43bab46c0fce6a6ea3c8c337a63ecb78c3705b8c281927b5",
+   "option_with_default": "e98d969a3a309b2a43bab46c0fce6a6ea3c8c337a63ecb78c3705b8c281927b5",
+   "str_empty": "bfdf01b69cbed525f27d0a9c1ba1043086ae5705fbc4027f5cf899a394e38bca",
+   "str_simple": "ad42020604bcc3c36bbcef8843de7b7c3af80b4198a109b7768d65bc2f788b1a",
+   "str_unicode": "26972840e4f5f71b2303902e0247aaf1e27d8a14ab6495c433d1f95c32dd40e8",
+   "task_simple": "1ff8ca42cdc94959e1b0c3c019ef4ab1f45b30a4309cc9fef3e42f4ea7da3e86",
+   "task_submitted": "834fae0fffb762b20064e8c648221dab99e81ba6f00622219fccce1bd0a18a17",
+   "task_using_output": "6c51d8124133038482472973a439d785b7ce53e46bac096e047e0e6cf1fc104e",
+   "task_with_config": "b26f8a8f7b1b9f6bda7e9c7e334071097b377ac48caec9d7da7fe98bc8c97c84",
+   "task_with_init": "285697abd5eaef36264f640ef790880f076daea4aff1814f1a518aa014ba4b0d",
+   "task_with_multiple_init": "d0e8610e1312d9a3398b839c691e16c741d4520823763067465a3ddab63acb30",
+   "task_with_output": "9cbaadb16fc6168286703afe35805108c4600abd05380fe56160f50e20b3cbb6"
+ }
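
These 64-character values are SHA-256 hex digests used as regression fixtures for configuration identifiers. A sketch of how such a fixture can be compared against freshly computed values (the `compute_identifier` callable is a stand-in, not the package's actual test helper):

    import json
    from pathlib import Path

    def check_stability(compute_identifier, fixture_path: Path) -> None:
        # Fails if any identifier drifts from the recorded digest
        expected = json.loads(fixture_path.read_text())
        for case, digest in sorted(expected.items()):
            assert compute_identifier(case) == digest, f"identifier drifted: {case}"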
experimaestro/tests/launchers/bin/sacct CHANGED
@@ -13,9 +13,13 @@ fi
  find "$XPM_SLURM_DIR/jobs" -name "*.start" | while read name; do
      jobid=${name%.start}
      sf="$jobid.status"
+     timeout_marker="$jobid.timeout"
      if test -f "$sf"; then
          exitcode="$(cat $sf)"
-         if test "$exitcode" == 0; then
+         # Check for timeout marker file
+         if test -f "$timeout_marker"; then
+             status=TIMEOUT
+         elif test "$exitcode" == 0; then
              status=COMPLETED
          else
              status=FAILED
@@ -25,4 +29,4 @@ find "$XPM_SLURM_DIR/jobs" -name "*.start" | while read name; do
      fi

      echo "$(basename $jobid)|${status}|$(cat ${jobid}.start)|$(cat ${jobid}.start)|"
- done
+ done
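
Given the final echo, each job is reported as a pipe-separated record of the form `jobid|STATE|start|start|`; a job whose `.timeout` marker exists would therefore produce a line like the following (job ID and date are illustrative):

    12345|TIMEOUT|Mon Jan  1 12:00:00 UTC 2024|Mon Jan  1 12:00:00 UTC 2024|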
experimaestro/tests/launchers/bin/sbatch CHANGED
@@ -65,12 +65,14 @@ done < "$1"

  cd "$chdir"
  echo "Starting $@ ${args[@]} > $stdout 2> $stderr" >&2
+ # Get job ID before forking
+ JOBID="$$"
  (
      export PATH="${CURDIR}/bin:$PATH"
+     export SLURM_JOB_ID="$JOBID"
      eval "$@" "${args[@]}"
-     echo $? > "$XPM_SLURM_DIR/jobs/$$.status"
+     echo $? > "$XPM_SLURM_DIR/jobs/$JOBID.status"
  ) > $stdout 2> $stderr &
- JOBID="$$"
  date > "$XPM_SLURM_DIR/jobs/$JOBID.start"
  disown

experimaestro/tests/launchers/test_slurm.py CHANGED
@@ -6,6 +6,8 @@ from experimaestro.connectors.local import LocalConnector
  from experimaestro.launchers.slurm import (
      SlurmLauncher,
  )
+ from experimaestro import field, ResumableTask, Param
+ from experimaestro.scheduler import JobState
  import shutil
  import pytest
  from .common import waitFromSpec, takeback
@@ -84,3 +86,81 @@ def test_slurm_takeback(slurmlauncher, tmp_path):
      datapath = tmp_path / "data"

      takeback(slurmlauncher, datapath, txp1, txp2)
+
+
+ class SlurmResumableTask(ResumableTask):
+     """ResumableTask that simulates timeout on first N attempts for SLURM testing"""
+
+     checkpoint: Param[Path]
+     timeout_count: Param[int] = field(ignore_default=2)
+     slurm_jobs_dir: Param[Path]  # Path to mock SLURM jobs directory
+     output_file: Param[Path] = field(ignore_default=None)
+
+     def execute(self):
+         import os
+
+         # Read current attempt count from checkpoint
+         attempt = 1
+         if self.checkpoint.exists():
+             attempt = int(self.checkpoint.read_text()) + 1
+
+         print(f"SlurmResumableTask attempt #{attempt}")
+
+         # Write updated attempt count
+         self.checkpoint.write_text(str(attempt))
+
+         # Simulate timeout for first timeout_count attempts
+         if attempt <= self.timeout_count:
+             print(f"Simulating SLURM TIMEOUT on attempt {attempt}")
+             # Create timeout marker file for mock SLURM
+             # The marker needs to be named <jobid>.timeout in the SLURM jobs directory
+             # Use SLURM_JOB_ID environment variable (set by mock sbatch, like real SLURM)
+             job_id = os.environ.get("SLURM_JOB_ID")
+             if job_id:
+                 timeout_marker = self.slurm_jobs_dir / f"{job_id}.timeout"
+                 timeout_marker.write_text(f"timeout on attempt {attempt}")
+             # Exit with error to trigger SLURM timeout handling
+             raise RuntimeError(f"Simulated timeout on attempt {attempt}")
+
+         # Success - task completed
+         print(f"Task completed successfully on attempt {attempt}")
+         if self.output_file:
+             self.output_file.write_text(f"Completed after {attempt} attempts")
+
+
+ @pytest.mark.timeout(30)
+ def test_slurm_resumable_task(tmp_path: Path, slurmlauncher: SlurmLauncher):
+     """Test that ResumableTask retries and resumes after SLURM timeouts"""
+     with TemporaryExperiment("slurm-resumable", workdir=tmp_path / "xp", maxwait=25):
+         checkpoint = tmp_path / "checkpoint.txt"
+         output_file = tmp_path / "output.txt"
+
+         # Get the SLURM jobs directory from the launcher's binpath
+         slurm_jobs_dir = slurmlauncher.binpath.parent / "slurm" / "jobs"
+
+         # Submit task with max_retries to allow multiple timeout retries
+         task = SlurmResumableTask.C(
+             checkpoint=checkpoint,
+             timeout_count=2,  # Timeout on first 2 attempts
+             slurm_jobs_dir=slurm_jobs_dir,
+             output_file=output_file,
+         ).submit(launcher=slurmlauncher, max_retries=5)
+
+         # Wait for the task to complete
+         state = task.__xpm__.job.wait()
+
+         # Verify task completed successfully after retries
+         assert state == JobState.DONE, f"Task did not complete successfully: {state}"
+         assert (
+             task.__xpm__.job.retry_count == 2
+         ), f"Expected 2 retries, got {task.__xpm__.job.retry_count}"
+
+         # Verify checkpoint shows 3 attempts (2 timeouts + 1 success)
+         assert checkpoint.exists(), "Checkpoint file was not created"
+         assert (
+             int(checkpoint.read_text()) == 3
+         ), f"Expected 3 attempts, got {checkpoint.read_text()}"
+
+         # Verify output file was created on success
+         assert output_file.exists(), "Output file was not created"
+         assert "Completed after 3 attempts" in output_file.read_text()
experimaestro/tests/tasks/test_dynamic.py ADDED
@@ -0,0 +1,231 @@
+ # Test for future task outputs handling
+ # https://github.com/experimaestro/experimaestro-python/issues/90
+
+ from functools import partial
+ import json
+ import logging
+ from pathlib import Path
+ import sys
+ import time
+ from experimaestro import (
+     Config,
+     Param,
+     Task,
+     ResumableTask,
+     DependentMarker,
+     LightweightTask,
+     field,
+     PathGenerator,
+ )
+ from experimaestro.core.arguments import Meta
+ from experimaestro.tests.utils import TemporaryDirectory, TemporaryExperiment
+
+
+ class Model(Config):
+     pass
+
+
+ class Checkpoint(Config):
+     step: Param[int]
+     model: Param[Model]
+
+
+ class CheckpointLoader(LightweightTask):
+     checkpoint: Param[Checkpoint]
+
+     def execute(self):
+         pass
+
+
+ class Evaluate(Task):
+     model: Param[Model]
+
+     def execute(self):
+         pass
+
+
+ class Validation(Config):
+     model: Param[Model]
+
+     def checkpoint(self, dep: DependentMarker, *, step: int) -> Checkpoint:
+         return dep(Checkpoint.C(model=self.model, step=step))
+
+     def compute(self, step: int):
+         self.register_task_output(self.checkpoint, step=step)
+
+
+ class Learn(ResumableTask):
+     model: Param[Model]
+     validation: Param[Validation]
+
+     # Control files for synchronization with tests
+     max_step_file: Meta[Path] = field(default_factory=PathGenerator("max_step"))
+     state_file: Meta[Path] = field(default_factory=PathGenerator("state.json"))
+
+     def execute(self):
+         start_step = 0
+
+         if self.state_file.exists():
+             with self.state_file.open("r") as f:
+                 state = json.load(f)
+                 start_step = state.get("last_step", 0)
+             logging.info("Resuming from step %d", start_step)
+
+         # Wait for max_step_file to know how far to go
+         while not self.max_step_file.is_file():
+             time.sleep(0.1)
+
+         with self.max_step_file.open("r") as f:
+             max_step = int(f.read().strip())
+         self.max_step_file.unlink()
+
+         # Use absolute value for step comparison
+         # Negative max_step means: produce up to |max_step| then crash (simulate interruption)
+         # Positive max_step means: produce up to max_step then complete normally
+         abs_max = abs(max_step)
+
+         for step in [15, 30, 45]:
+             if step <= start_step:
+                 logging.info("Skipping already processed step %d", step)
+                 continue
+
+             if step > abs_max:
+                 # We're past the limit, stop here
+                 break
+
+             self.validation.compute(step)
+
+             # Save state after each checkpoint
+             with self.state_file.open("w") as f:
+                 json.dump({"last_step": step}, f)
+
+             # If max_step is negative (e.g. -15), simulate exit after producing |max_step|
+             if max_step < 0 and step >= abs_max:
+                 logging.warning("Simulating interruption after step %d", step)
+                 sys.exit(1)
+
+
+ def evaluate(evaluations, checkpoint: Checkpoint):
+     logging.warning("Evaluating checkpoint %s", checkpoint)
+     task = Evaluate.C(model=checkpoint.model)
+     checkpoint_loader = CheckpointLoader.C(checkpoint=checkpoint)
+     evaluations.append(task.submit(init_tasks=[checkpoint_loader]))
+
+
+ def test_task_dynamic_simple():
+     """Test that dynamic task outputs trigger callbacks
+
+     This test verifies that callbacks are guaranteed to complete before
+     the experiment context exits. The callback waits for jobs to complete
+     before submitting evaluations, which validates that the synchronization
+     logic correctly waits for all callbacks to finish.
+     """
+     import asyncio
+
+     evaluations = []
+     xp_ref = [None]  # To access xp from callback
+
+     def collect_checkpoint(checkpoint: Checkpoint):
+         """Callback that waits for jobs to complete before evaluating
+
+         This simulates a real-world scenario where the callback needs to wait
+         for the triggering task to complete before it can proceed (e.g., to
+         read outputs from the task's directory).
+         """
+         logging.info("Received checkpoint %s, waiting for jobs to complete", checkpoint)
+         xp = xp_ref[0]
+
+         # Wait for unfinished jobs to become 0 (all tasks completed)
+         async def wait_for_jobs_done():
+             async with xp.scheduler.exitCondition:
+                 while xp.unfinishedJobs > 0:
+                     await xp.scheduler.exitCondition.wait()
+
+         asyncio.run_coroutine_threadsafe(
+             wait_for_jobs_done(), xp.scheduler.loop
+         ).result()
+
+         # Now submit evaluation
+         logging.info("Jobs done, submitting evaluation for checkpoint %s", checkpoint)
+         evaluate(evaluations, checkpoint)
+
+     with TemporaryDirectory() as workdir:
+         with TemporaryExperiment("dynamic", maxwait=10, workdir=workdir) as xp:
+             xp_ref[0] = xp
+             model = Model.C()
+             validation = Validation.C(model=model)
+             learn = Learn.C(model=model, validation=validation)
+             learn.watch_output(validation.checkpoint, collect_checkpoint)
+
+             learn.submit()
+
+             # Allow the task to run up to step 30
+             learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
+             with learn.max_step_file.open("w") as f:
+                 f.write("30")
+
+             logging.info("Experiment will wait for completion...")
+
+     assert len(evaluations) == 2, f"Expected 2 evaluations, got {len(evaluations)}"
+
+
+ def test_task_dynamic_replay():
+     """Test that dynamic outputs are replayed when a task is restarted
+
+     Scenario:
+     1. First run: task produces checkpoint for step 15, then exits (simulated timeout)
+     2. Second run: task should replay the step 15 checkpoint and produce new ones
+     """
+     with TemporaryDirectory() as workdir:
+         # First run: produce one checkpoint then exit
+         evaluations_run1 = []
+         try:
+             with TemporaryExperiment("dynamic_replay", maxwait=5, workdir=workdir):
+                 model = Model.C()
+                 validation = Validation.C(model=model)
+                 learn = Learn.C(model=model, validation=validation)
+                 learn.watch_output(
+                     validation.checkpoint, partial(evaluate, evaluations_run1)
+                 )
+
+                 learn.submit()
+
+                 # Allow task to produce step 15 checkpoint, then simulate crash
+                 # Negative value means: produce up to |value| then exit with error
+                 learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
+                 with learn.max_step_file.open("w") as f:
+                     f.write("-15")
+
+         except Exception as e:
+             # Expected: the task will fail when trying to go past max_step
+             logging.info("First run ended (expected): %s", e)
+
+         # First run should have produced at least one evaluation (for step 15)
+         assert (
+             len(evaluations_run1) == 1
+         ), f"Run 1: Expected 1 evaluation, got {len(evaluations_run1)}"
+
+         # Second run: restart and continue
+         evaluations_run2 = []
+         with TemporaryExperiment("dynamic_replay", maxwait=30, workdir=workdir):
+             model = Model.C()
+             validation = Validation.C(model=model)
+             learn = Learn.C(model=model, validation=validation)
+             learn.watch_output(
+                 validation.checkpoint, partial(evaluate, evaluations_run2)
+             )
+
+             learn.submit()
+
+             # Allow task to run to completion (step 45)
+             learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
+             with learn.max_step_file.open("w") as f:
+                 f.write("45")
+
+         # Second run should have:
+         # - Replayed the step 15 checkpoint (from first run)
+         # - Produced step 30 and 45 checkpoints
+         # Total: 3 evaluations (but step 15 was replayed, not re-produced)
+         assert (
+             len(evaluations_run2) == 3
+         ), f"Run 2: Expected 3 evaluations, got {len(evaluations_run2)}"