sl-shared-assets 2.0.1__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sl-shared-assets might be problematic.

Files changed (32)
  1. sl_shared_assets/__init__.py +17 -9
  2. sl_shared_assets/__init__.pyi +12 -8
  3. sl_shared_assets/cli.py +258 -21
  4. sl_shared_assets/cli.pyi +44 -5
  5. sl_shared_assets/data_classes/__init__.py +8 -3
  6. sl_shared_assets/data_classes/__init__.pyi +8 -4
  7. sl_shared_assets/data_classes/configuration_data.py +149 -30
  8. sl_shared_assets/data_classes/configuration_data.pyi +49 -11
  9. sl_shared_assets/data_classes/runtime_data.py +70 -49
  10. sl_shared_assets/data_classes/runtime_data.pyi +41 -33
  11. sl_shared_assets/data_classes/session_data.py +193 -253
  12. sl_shared_assets/data_classes/session_data.pyi +99 -116
  13. sl_shared_assets/data_classes/surgery_data.py +1 -1
  14. sl_shared_assets/server/__init__.py +2 -2
  15. sl_shared_assets/server/__init__.pyi +5 -2
  16. sl_shared_assets/server/job.py +229 -1
  17. sl_shared_assets/server/job.pyi +111 -0
  18. sl_shared_assets/server/server.py +431 -31
  19. sl_shared_assets/server/server.pyi +158 -15
  20. sl_shared_assets/tools/__init__.py +2 -1
  21. sl_shared_assets/tools/__init__.pyi +2 -0
  22. sl_shared_assets/tools/ascension_tools.py +9 -21
  23. sl_shared_assets/tools/ascension_tools.pyi +1 -1
  24. sl_shared_assets/tools/packaging_tools.py +2 -2
  25. sl_shared_assets/tools/project_management_tools.py +147 -41
  26. sl_shared_assets/tools/project_management_tools.pyi +45 -6
  27. {sl_shared_assets-2.0.1.dist-info → sl_shared_assets-3.0.0.dist-info}/METADATA +127 -13
  28. sl_shared_assets-3.0.0.dist-info/RECORD +36 -0
  29. {sl_shared_assets-2.0.1.dist-info → sl_shared_assets-3.0.0.dist-info}/entry_points.txt +2 -0
  30. sl_shared_assets-2.0.1.dist-info/RECORD +0 -36
  31. {sl_shared_assets-2.0.1.dist-info → sl_shared_assets-3.0.0.dist-info}/WHEEL +0 -0
  32. {sl_shared_assets-2.0.1.dist-info → sl_shared_assets-3.0.0.dist-info}/licenses/LICENSE +0 -0
sl_shared_assets/data_classes/session_data.pyi
@@ -1,84 +1,41 @@
+from enum import StrEnum
 from pathlib import Path
 from dataclasses import field, dataclass

 from _typeshed import Incomplete
 from ataraxis_data_structures import YamlConfig

-from .configuration_data import get_system_configuration_data as get_system_configuration_data
+from .configuration_data import (
+    AcquisitionSystems as AcquisitionSystems,
+    get_system_configuration_data as get_system_configuration_data,
+)

-_valid_session_types: Incomplete
+class SessionTypes(StrEnum):
+    """Defines the set of data acquisition session types supported by various data acquisition systems used in the
+    Sun lab.

-@dataclass()
-class VersionData(YamlConfig):
-    """Stores information about the versions of important Sun lab libraries used to acquire the session's data."""
-
-    python_version: str = ...
-    sl_experiment_version: str = ...
-
-@dataclass()
-class ProjectConfiguration(YamlConfig):
-    """Stores the project-specific configuration parameters that do not change between different animals and runtime
-    sessions.
-
-    An instance of this class is generated and saved as a .yaml file in the 'configuration' directory of each project
-    when it is created. After that, the stored data is reused for every runtime (training or experiment session) carried
-    out for each animal of the project. Additionally, a copy of the most actual configuration file is saved inside each
-    runtime session's 'raw_data' folder, providing seamless integration between the managed data and various Sun lab
-    (sl-) libraries.
+    A data acquisition session broadly encompasses a recording session carried out to either: acquire experiment data,
+    train the animal for the upcoming experiments, or to assess the quality of surgical or other pre-experiment
+    intervention.

     Notes:
-        Together with SessionData, this class forms the entry point for all interactions with the data acquired in the
-        Sun lab. The fields of this class are used to flexibly configure the runtime behavior of major data acquisition
-        (sl-experiment) and processing (sl-forgery) libraries, adapting them for any project in the lab.
+        This enumeration does not differentiate between different acquisition systems. Different acquisition systems
+        support different session types, and may not be suited for acquiring some of the session types listed in this
+        enumeration.
     """

-    project_name: str = ...
-    surgery_sheet_id: str = ...
-    water_log_sheet_id: str = ...
-    @classmethod
-    def load(cls, configuration_path: Path) -> ProjectConfiguration:
-        """Loads the project configuration parameters from the specified project_configuration.yaml file.
-
-        This method is called during each interaction with any runtime session's data, including the creation of a new
-        session.
-
-        Args:
-            configuration_path: The path to the project_configuration.yaml file from which to load the data.
-
-        Returns:
-            The initialized ProjectConfiguration instance that stores the configuration data for the target project.
-
-        Raise:
-            FileNotFoundError: If the specified configuration file does not exist or is not a valid YAML file.
-        """
-    def save(self, path: Path) -> None:
-        """Saves class instance data to disk as a project_configuration.yaml file.
-
-        This method is automatically called from the 'sl_experiment' library when a new project is created. After this
-        method's runtime, all future project initialization calls will use the load() method to reuse configuration data
-        saved to the .yaml file created by this method.
-
-        Args:
-            path: The path to the .yaml file to save the data to.
-        """
-    def _verify_data(self) -> None:
-        """Verifies the user-modified data loaded from the project_configuration.yaml file.
-
-        Since this class is explicitly designed to be modified by the user, this verification step is carried out to
-        ensure that the loaded data matches expectations. This reduces the potential for user errors to impact the
-        runtime behavior of the libraries using this class. This internal method is automatically called by the load()
-        method.
-
-        Raises:
-            ValueError: If the loaded data does not match expected formats or values.
-        """
+    LICK_TRAINING = "lick training"
+    RUN_TRAINING = "run training"
+    MESOSCOPE_EXPERIMENT = "mesoscope experiment"
+    WINDOW_CHECKING = "window checking"

 @dataclass()
 class RawData:
     """Stores the paths to the directories and files that make up the 'raw_data' session-specific directory.

-    The raw_data directory stores the data acquired during the session runtime before and after preprocessing. Since
-    preprocessing does not alter the data, any data in that folder is considered 'raw'.
+    The raw_data directory stores the data acquired during the session data acquisition runtime, before and after
+    preprocessing. Since preprocessing does not irreversibly alter the data, any data in that folder is considered
+    'raw,' even if preprocessing losslessly re-compresses the data for efficient transfer.

     Notes:
         Sun lab data management strategy primarily relies on keeping multiple redundant copies of the raw_data for
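
For illustration, the new SessionTypes enumeration replaces the former free-form session type strings with a StrEnum, so members compare equal to their string values. A minimal sketch (the import path is an assumption, not taken from the diff):

    # Import path assumed; adjust to wherever SessionTypes is re-exported.
    from sl_shared_assets.data_classes import SessionTypes

    session_type = SessionTypes.LICK_TRAINING
    assert session_type == "lick training"  # StrEnum members behave as plain strings
    assert SessionTypes("run training") is SessionTypes.RUN_TRAINING  # recoverable from values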
@@ -94,7 +51,6 @@ class RawData:
     session_descriptor_path: Path = ...
     hardware_state_path: Path = ...
     surgery_metadata_path: Path = ...
-    project_configuration_path: Path = ...
     session_data_path: Path = ...
     experiment_configuration_path: Path = ...
     mesoscope_positions_path: Path = ...
@@ -103,21 +59,24 @@ class RawData:
     checksum_path: Path = ...
     telomere_path: Path = ...
     ubiquitin_path: Path = ...
+    nk_path: Path = ...
     integrity_verification_tracker_path: Path = ...
-    version_data_path: Path = ...
     def resolve_paths(self, root_directory_path: Path) -> None:
         """Resolves all paths managed by the class instance based on the input root directory path.

-        This method is called each time the class is instantiated to regenerate the managed path hierarchy on any
-        machine that instantiates the class.
+        This method is called each time the (wrapper) SessionData class is instantiated to regenerate the managed path
+        hierarchy on any machine that instantiates the class.

         Args:
-            root_directory_path: The path to the top-level directory of the local hierarchy. Depending on the managed
-                hierarchy, this has to point to a directory under the main /session, /animal, or /project directory of
-                the managed session.
+            root_directory_path: The path to the top-level directory of the session. Typically, this path is assembled
+                using the following hierarchy: root/project/animal/session_id
         """
     def make_directories(self) -> None:
-        """Ensures that all major subdirectories and the root directory exist, creating any missing directories."""
+        """Ensures that all major subdirectories and the root directory exist, creating any missing directories.
+
+        This method is called each time the (wrapper) SessionData class is instantiated and allowed to generate
+        missing data directories.
+        """

 @dataclass()
 class ProcessedData:
@@ -132,53 +91,52 @@ class ProcessedData:
     camera_data_path: Path = ...
     mesoscope_data_path: Path = ...
     behavior_data_path: Path = ...
-    job_logs_path: Path = ...
     suite2p_processing_tracker_path: Path = ...
-    dataset_formation_tracker_path: Path = ...
     behavior_processing_tracker_path: Path = ...
     video_processing_tracker_path: Path = ...
+    p53_path: Path = ...
     def resolve_paths(self, root_directory_path: Path) -> None:
         """Resolves all paths managed by the class instance based on the input root directory path.

-        This method is called each time the class is instantiated to regenerate the managed path hierarchy on any
-        machine that instantiates the class.
+        This method is called each time the (wrapper) SessionData class is instantiated to regenerate the managed path
+        hierarchy on any machine that instantiates the class.

         Args:
-            root_directory_path: The path to the top-level directory of the local hierarchy. Depending on the managed
-                hierarchy, this has to point to a directory under the main /session, /animal, or /project directory of
-                the managed session.
+            root_directory_path: The path to the top-level directory of the session. Typically, this path is assembled
+                using the following hierarchy: root/project/animal/session_id
         """
     def make_directories(self) -> None:
-        """Ensures that all major subdirectories and the root directory exist, creating any missing directories."""
+        """Ensures that all major subdirectories and the root directory exist, creating any missing directories.
+
+        This method is called each time the (wrapper) SessionData class is instantiated and allowed to generate
+        missing data directories.
+        """

 @dataclass
 class SessionData(YamlConfig):
-    """Stores and manages the data layout of a single training or experiment session acquired in the Sun lab.
-
-    The primary purpose of this class is to maintain the session data structure across all supported destinations and
-    during all processing stages. It generates the paths used by all other classes from all Sun lab libraries that
-    interact with the session's data from the point of its creation and until the data is integrated into an
-    analysis dataset.
+    """Stores and manages the data layout of a single Sun lab data acquisition session.

-    When necessary, the class can be used to either generate a new session or load the layout of an already existing
-    session. When the class is used to create a new session, it generates the new session's name using the current
-    UTC timestamp, accurate to microseconds. This ensures that each session name is unique and preserves the overall
-    session order.
+    The primary purpose of this class is to maintain the session data structure across all supported destinations and to
+    provide a unified data access interface shared by all Sun lab libraries. The class can be used to either generate a
+    new session or load the layout of an already existing session. When the class is used to create a new session, it
+    generates the new session's name using the current UTC timestamp, accurate to microseconds. This ensures that each
+    session 'name' is unique and preserves the overall session order.

     Notes:
         This class is specifically designed for working with the data from a single session, performed by a single
         animal under the specific experiment. The class is used to manage both raw and processed data. It follows the
-        data through acquisition, preprocessing and processing stages of the Sun lab data workflow. Together with
-        ProjectConfiguration class, this class serves as an entry point for all interactions with the managed session's
-        data.
+        data through acquisition, preprocessing and processing stages of the Sun lab data workflow. This class serves as
+        an entry point for all interactions with the managed session's data.
     """

     project_name: str
     animal_id: str
     session_name: str
-    session_type: str
-    acquisition_system: str
+    session_type: str | SessionTypes
+    acquisition_system: str | AcquisitionSystems
     experiment_name: str | None
+    python_version: str = ...
+    sl_experiment_version: str = ...
     raw_data: RawData = field(default_factory=Incomplete)
     processed_data: ProcessedData = field(default_factory=Incomplete)
     def __post_init__(self) -> None:
@@ -188,9 +146,11 @@ class SessionData(YamlConfig):
         cls,
         project_name: str,
         animal_id: str,
-        session_type: str,
+        session_type: SessionTypes | str,
         experiment_name: str | None = None,
         session_name: str | None = None,
+        python_version: str = "3.11.13",
+        sl_experiment_version: str = "2.0.0",
     ) -> SessionData:
         """Creates a new SessionData object and generates the new session's data structure on the local PC.

@@ -201,22 +161,27 @@ class SessionData(YamlConfig):
         To load an already existing session data structure, use the load() method instead.

         This method automatically dumps the data of the created SessionData instance into the session_data.yaml file
-        inside the root raw_data directory of the created hierarchy. It also finds and dumps other configuration
-        files, such as project_configuration.yaml, experiment_configuration.yaml, and system_configuration.yaml into
-        the same raw_data directory. This ensures that if the session's runtime is interrupted unexpectedly, the
-        acquired data can still be processed.
+        inside the root 'raw_data' directory of the created hierarchy. It also finds and dumps other configuration
+        files, such as experiment_configuration.yaml and system_configuration.yaml into the same 'raw_data'
+        directory. If the session's runtime is interrupted unexpectedly, the acquired data can still be processed
+        using these pre-saved class instances.

         Args:
-            project_name: The name of the project for which the data is acquired.
-            animal_id: The ID code of the animal for which the data is acquired.
-            session_type: The type of the session. Primarily, this determines how to read the session_descriptor.yaml
-                file. Valid options are 'Lick training', 'Run training', 'Window checking', or 'Experiment'.
-            experiment_name: The name of the experiment executed during managed session. This optional argument is only
-                used for 'Experiment' session types. It is used to find the experiment configuration .YAML file.
-            session_name: An optional session_name override. Generally, this argument should not be provided for most
+            project_name: The name of the project for which the session is carried out.
+            animal_id: The ID code of the animal participating in the session.
+            session_type: The type of the session. Has to be one of the supported session types exposed by the
+                SessionTypes enumeration.
+            experiment_name: The name of the experiment executed during the session. This optional argument is only
+                used for experiment sessions. Note! The name passed to this argument has to match the name of the
+                experiment configuration .yaml file.
+            session_name: An optional session name override. Generally, this argument should not be provided for most
                sessions. When provided, the method uses this name instead of generating a new timestamp-based name.
                This is only used during the 'ascension' runtime to convert old data structures to the modern
                lab standards.
+            python_version: The string that specifies the Python version used to collect session data. Has to be
+                specified using the major.minor.patch version format.
+            sl_experiment_version: The string that specifies the version of the sl-experiment library used to collect
+                session data. Has to be specified using the major.minor.patch version format.

         Returns:
             An initialized SessionData instance that stores the layout of the newly created session's data.
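
Taken together with the signature change above, new sessions are created roughly as follows. A hypothetical sketch: the import path and all argument values are assumptions, not taken from the package:

    from sl_shared_assets.data_classes import SessionData, SessionTypes  # import path assumed

    # The session name is auto-generated from the current UTC timestamp unless overridden.
    session = SessionData.create(
        project_name="example_project",            # hypothetical project
        animal_id="A001",                          # hypothetical animal
        session_type=SessionTypes.LICK_TRAINING,
        python_version="3.11.13",
        sl_experiment_version="2.0.0",
    )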
@@ -228,9 +193,9 @@ class SessionData(YamlConfig):
         """Loads the SessionData instance from the target session's session_data.yaml file.

         This method is used to load the data layout information of an already existing session. Primarily, this is used
-        when preprocessing or processing session data. Due to how SessionData is stored and used in the lab, this
-        method always loads the data layout from the session_data.yaml file stored inside the raw_data session
-        subfolder. Currently, all interactions with Sun lab data require access to the 'raw_data' folder.
+        when processing session data. Due to how SessionData is stored and used in the lab, this method always loads the
+        data layout from the session_data.yaml file stored inside the 'raw_data' session subfolder. Currently, all
+        interactions with Sun lab data require access to the 'raw_data' folder of each session.

         Notes:
             To create a new session, use the create() method instead.
@@ -250,11 +215,18 @@ class SessionData(YamlConfig):
         Raises:
             FileNotFoundError: If the 'session_data.yaml' file is not found under the session_path/raw_data/ subfolder.

+        """
+    def runtime_initialized(self) -> None:
+        """Ensures that the 'nk.bin' marker file is removed from the session's raw_data folder.
+
+        The 'nk.bin' marker is generated as part of the SessionData initialization (creation) process to mark sessions
+        that did not fully initialize during runtime. This service method is designed to be called by the sl-experiment
+        library classes to remove the 'nk.bin' marker when it is safe to do so. It should not be called by end-users.
         """
     def _save(self) -> None:
         """Saves the instance data to the 'raw_data' directory of the managed session as a 'session_data.yaml' file.

-        This is used to save the data stored in the instance to disk, so that it can be reused during preprocessing or
+        This is used to save the data stored in the instance to disk, so that it can be reused during further stages of
         data processing. The method is intended to only be used by the SessionData instance itself during its
         create() method runtime.
         """
@@ -274,6 +246,13 @@ class ProcessingTracker(YamlConfig):
     _is_running: bool = ...
     _lock_path: str = field(init=False)
     def __post_init__(self) -> None: ...
+    def __del__(self) -> None:
+        """If the instance is garbage-collected without calling the stop() method, assumes this is due to a runtime
+        error.
+
+        It is essential to always resolve the runtime as either 'stopped' or 'erred' to avoid deadlocking the session
+        data.
+        """
     def _load_state(self) -> None:
         """Reads the current processing state from the wrapped .YAML file."""
     def _save_state(self) -> None:
@@ -300,7 +279,11 @@ class ProcessingTracker(YamlConfig):
             TimeoutError: If the file lock for the target .YAML file cannot be acquired within the timeout period.
         """
     def stop(self) -> None:
-        """Mark processing as started.
+        """Configures the tracker file to indicate that the tracked processing runtime has been completed successfully.
+
+        After this method returns, it is UNSAFE to do any further processing from the process that calls this method.
+        Any process that calls the 'start' method of this class is expected to also call this method or 'error' method
+        at the end of the runtime.

         Raises:
             TimeoutError: If the file lock for the target .YAML file cannot be acquired within the timeout period.
@@ -308,12 +291,12 @@ class ProcessingTracker(YamlConfig):
     @property
     def is_complete(self) -> bool:
         """Returns True if the tracker wrapped by the instance indicates that the processing runtime has been completed
-        successfully and False otherwise."""
+        successfully at least once and that there is no ongoing processing that uses the target session."""
     @property
     def encountered_error(self) -> bool:
-        """Returns True if the tracker wrapped by the instance indicates that the processing runtime aborted due to
-        encountering an error and False otherwise."""
+        """Returns True if the tracker wrapped by the instance indicates that the processing runtime for the target
+        session has aborted due to encountering an error."""
     @property
     def is_running(self) -> bool:
         """Returns True if the tracker wrapped by the instance indicates that the processing runtime is currently
-        running and False otherwise."""
+        running for the target session."""
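
The start/stop/error contract documented above maps onto a try/except pattern. A sketch, assuming the tracker wraps one of the tracker .yaml files listed in ProcessedData and that the constructor argument name and the error() method match the docstring references (neither is shown in full here):

    tracker = ProcessingTracker(file_path=session.processed_data.suite2p_processing_tracker_path)  # argument name assumed
    tracker.start()
    try:
        run_suite2p_pipeline()  # stand-in for the actual processing logic
        tracker.stop()          # marks the runtime as completed successfully
    except Exception:
        tracker.error()         # the 'error' method referenced by the stop() docstring
        raise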
sl_shared_assets/data_classes/surgery_data.py
@@ -51,7 +51,7 @@ class ProcedureData:
     surgery_quality: int = 0
     """Stores the quality of the surgical intervention as a numeric level. 0 indicates unusable (bad) result, 1
     indicates usable result that is not good enough to be included in a publication, 2 indicates publication-grade
-    result."""
+    result, 3 indicates high-tier publication grade result."""


 @dataclass
sl_shared_assets/server/__init__.py
@@ -2,7 +2,7 @@
 and other compute servers. This package is also used across all Sun lab members private code to interface with the
 shared server."""

-from .job import Job
+from .job import Job, JupyterJob
 from .server import Server, ServerCredentials, generate_server_credentials

-__all__ = ["Server", "ServerCredentials", "generate_server_credentials", "Job"]
+__all__ = ["Server", "ServerCredentials", "generate_server_credentials", "Job", "JupyterJob"]
sl_shared_assets/server/__init__.pyi
@@ -1,8 +1,11 @@
-from .job import Job as Job
+from .job import (
+    Job as Job,
+    JupyterJob as JupyterJob,
+)
 from .server import (
     Server as Server,
     ServerCredentials as ServerCredentials,
     generate_server_credentials as generate_server_credentials,
 )

-__all__ = ["Server", "ServerCredentials", "generate_server_credentials", "Job"]
+__all__ = ["Server", "ServerCredentials", "generate_server_credentials", "Job", "JupyterJob"]
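
After these two changes, the public import surface of the server subpackage matches the updated __all__:

    from sl_shared_assets.server import Job, JupyterJob, Server, ServerCredentials, generate_server_credentials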
sl_shared_assets/server/job.py
@@ -1,13 +1,51 @@
 """This module provides the core Job class, used as the starting point for all SLURM-managed job executed on lab compute
 server(s). Specifically, the Job class acts as a wrapper around the SLURM configuration and specific logic of each
 job. During runtime, Server class interacts with input job objects to manage their transfer and execution on the
-remote servers."""
+remote servers.
+
+Since version 3.0.0, this module also provides the specialized JupyterJob class used to launch remote Jupyter
+notebook servers.
+"""

 # noinspection PyProtectedMember
+import re
 from pathlib import Path
 import datetime
+from dataclasses import dataclass

+# noinspection PyProtectedMember
 from simple_slurm import Slurm  # type: ignore
+from ataraxis_base_utilities import LogLevel, console
+
+
+@dataclass
+class _JupyterConnectionInfo:
+    """Stores the data used to establish the connection with a Jupyter notebook server running under SLURM control on a
+    remote Sun lab server.
+
+    More specifically, this class is used to transfer the connection metadata collected on the remote server back to
+    the local machine that requested the server to be established.
+    """
+
+    compute_node: str
+    """The hostname of the compute node where Jupyter is running."""
+
+    port: int
+    """The port number on which Jupyter is listening for communication. Usually, this is the default port 8888 or 9999.
+    """
+
+    token: str
+    """The authentication token for the Jupyter server. This token is used to authenticate the user when establishing
+    communication with the Jupyter server."""
+
+    @property
+    def localhost_url(self) -> str:
+        """Returns the localhost URL for connecting to the server.
+
+        To use this URL, first set up an SSH tunnel to the server via the specific Jupyter communication port and the
+        remote server access credentials.
+        """
+        return f"http://localhost:{self.port}/?token={self.token}"


 class Job:
@@ -138,3 +176,193 @@ class Job:

         # Returns the script content to caller as a string
         return fixed_script_content
+
+
+class JupyterJob(Job):
+    """Specialized Job instance designed to launch a Jupyter notebook server on SLURM.
+
+    This class extends the base Job class to include Jupyter-specific configuration and commands for starting a
+    notebook server in a SLURM environment. Using this specialized job allows users to set up remote Jupyter servers
+    while still benefitting from SLURM's job management and fair airtime policies.
+
+    Notes:
+        Jupyter servers directly compete for resources with headless data processing jobs. Therefore, it is important
+        to minimize the resource footprint and the runtime of each Jupyter server, if possible.
+
+    Args:
+        job_name: The descriptive name of the Jupyter SLURM job to be created. Primarily, this name is used in terminal
+            printouts to identify the job to human operators.
+        output_log: The absolute path to the .txt file on the processing server, where to store the standard output
+            data of the job.
+        error_log: The absolute path to the .txt file on the processing server, where to store the standard error
+            data of the job.
+        working_directory: The absolute path to the directory where temporary job files will be stored. During runtime,
+            classes from this library use that directory to store files such as the job's shell script. All such files
+            are automatically removed from the directory at the end of a non-errors runtime.
+        conda_environment: The name of the conda environment to activate on the server before running the job logic. The
+            environment should contain the necessary Python packages and CLIs to support running the job's logic. For
+            Jupyter jobs, this necessarily includes the Jupyter notebook and jupyterlab packages.
+        port: The connection port number for Jupyter server. Do not change the default value unless you know what you
+            are doing, as the server has most common communication ports closed for security reasons.
+        notebook_directory: The directory to use as Jupyter's root. During runtime, Jupyter will only have access to
+            items stored in or under this directory. For most runtimes, this should be set to the user's root data or
+            working directory.
+        cpus_to_use: The number of CPUs to allocate to the Jupyter server. Keep this value as small as possible to avoid
+            interfering with headless data processing jobs.
+        ram_gb: The amount of RAM, in GB, to allocate to the Jupyter server. Keep this value as small as possible to
+            avoid interfering with headless data processing jobs.
+        time_limit: The maximum Jupyter server uptime, in minutes. Set this to the expected duration of your jupyter
+            session.
+        jupyter_args: Stores additional arguments to pass to jupyter notebook initialization command.
+
+    Attributes:
+        port: Stores the connection port of the managed Jupyter server.
+        notebook_dir: Stores the absolute path to the directory used as Jupyter's root, relative to the remote server
+            root.
+        connection_info: Stores the JupyterConnectionInfo instance after the Jupyter server is instantiated.
+        host: Stores the hostname of the remote server.
+        user: Stores the username used to connect with the remote server.
+        connection_info_file: The absolute path to the file that stores connection information, relative to the remote
+            server root.
+        _command: Stores the shell command for launching the Jupyter server.
+    """
+
+    def __init__(
+        self,
+        job_name: str,
+        output_log: Path,
+        error_log: Path,
+        working_directory: Path,
+        conda_environment: str,
+        notebook_directory: Path,
+        port: int = 9999,  # Defaults to using port 9999
+        cpus_to_use: int = 2,  # Defaults to 2 CPU cores
+        ram_gb: int = 32,  # Defaults to 32 GB of RAM
+        time_limit: int = 120,  # Defaults to 2 hours of runtime (120 minutes)
+        jupyter_args: str = "",
+    ) -> None:
+        # Initializes parent Job class
+        super().__init__(
+            job_name=job_name,
+            output_log=output_log,
+            error_log=error_log,
+            working_directory=working_directory,
+            conda_environment=conda_environment,
+            cpus_to_use=cpus_to_use,
+            ram_gb=ram_gb,
+            time_limit=time_limit,
+        )
+
+        # Saves important jupyter configuration parameters to class attributes
+        self.port = port
+        self.notebook_dir = notebook_directory
+
+        # Similar to job ID, these attributes initialize to None and are reconfigured as part of the job submission
+        # process.
+        self.connection_info: _JupyterConnectionInfo | None = None
+        self.host: str | None = None
+        self.user: str | None = None
+
+        # Resolves the server-side path to the jupyter server connection info file.
+        self.connection_info_file = working_directory.joinpath(f"{job_name}_connection.txt")
+
+        # Builds Jupyter launch command.
+        self._build_jupyter_command(jupyter_args)
+
+    def _build_jupyter_command(self, jupyter_args: str) -> None:
+        """Builds the command to launch Jupyter notebook server on the remote Sun lab server."""
+
+        # Gets the hostname of the compute node and caches it in the connection data file. Also caches the port name.
+        self.add_command('echo "COMPUTE_NODE: $(hostname)" > {}'.format(self.connection_info_file))
+        self.add_command('echo "PORT: {}" >> {}'.format(self.port, self.connection_info_file))
+
+        # Generates a random access token for security and caches it in the connection data file.
+        self.add_command("TOKEN=$(openssl rand -hex 24)")
+        self.add_command('echo "TOKEN: $TOKEN" >> {}'.format(self.connection_info_file))
+
+        # Builds Jupyter startup command.
+        jupyter_cmd = [
+            "jupyter lab",
+            "--no-browser",
+            f"--port={self.port}",
+            "--ip=0.0.0.0",  # Listen on all interfaces
+            "--ServerApp.allow_origin='*'",  # Allow connections from SSH tunnel
+            "--ServerApp.allow_remote_access=True",  # Enable remote access
+            "--ServerApp.disable_check_xsrf=True",  # Helps with proxy connections
+            f"--ServerApp.root_dir={self.notebook_dir}",  # Root directory (not notebook-dir)
+            "--IdentityProvider.token=$TOKEN",  # Token authentication
+        ]
+
+        # Adds any additional arguments.
+        if jupyter_args:
+            jupyter_cmd.append(jupyter_args)
+
+        # Adds resolved jupyter command to the list of job commands.
+        jupyter_cmd_str = " ".join(jupyter_cmd)
+        self.add_command(jupyter_cmd_str)
+
+    def parse_connection_info(self, info_file: Path) -> None:
+        """Parses the connection information file created by the Jupyter job on the server.
+
+        Use this method to parse the connection file fetched from the server to finalize setting up the Jupyter
+        server job.
+
+        Args:
+            info_file: The path to the .txt file generated by the remote server that stores the Jupyter connection
+                information to be parsed.
+        """
+
+        with open(info_file, "r") as f:
+            content = f.read()
+
+        # Extracts information using regex
+        compute_node_match = re.search(r"COMPUTE_NODE: (.+)", content)
+        port_match = re.search(r"PORT: (\d+)", content)
+        token_match = re.search(r"TOKEN: (.+)", content)
+
+        if not all([compute_node_match, port_match, token_match]):
+            message = f"Could not parse connection information file for the Jupyter server job with id {self.job_id}."
+            console.error(message, ValueError)
+
+        # Stores extracted data inside connection_info attribute as a JupyterConnectionInfo instance.
+        self.connection_info = _JupyterConnectionInfo(
+            compute_node=compute_node_match.group(1).strip(),  # type: ignore
+            port=int(port_match.group(1)),  # type: ignore
+            token=token_match.group(1).strip(),  # type: ignore
+        )
+
+    def print_connection_info(self) -> None:
+        """Constructs and displays the command to set up the SSH tunnel to the server and the link to the localhost
+        server view in the terminal.
+
+        The SSH command should be used via a separate terminal or subprocess call to establish the secure SSH tunnel to
+        the Jupyter server. Once the SSH tunnel is established, the printed localhost url can be used to view the
+        server from the local machine.
+        """
+
+        # If connection information is not available, there is nothing to print
+        if self.connection_info is None:
+            console.echo(
+                message=(
+                    f"No connection information is available for the job {self.job_name}, which indicates that the job "
+                    f"has not been submitted to the server. Submit the job for execution to the remote Sun lab server "
+                    f"to generate the connection information"
+                ),
+                level=LogLevel.WARNING,
+            )
+            return  # No connection information available, so does not proceed with printing.
+
+        # Prints generic connection details to terminal
+        console.echo(f"Jupyter is running on: {self.connection_info.compute_node}")
+        console.echo(f"Port: {self.connection_info.port}")
+        console.echo(f"Token: {self.connection_info.token}")
+
+        # Constructs and displays the SSH tunnel command and the localhost url for connecting to the server
+        tunnel_cmd = (
+            f"ssh -N -L {self.connection_info.port}:{self.connection_info.compute_node}:{self.connection_info.port} "
+            f"{self.user}@{self.host}"
+        )
+        localhost_url = f"http://localhost:{self.connection_info.port}/?token={self.connection_info.token}"
+        print(f"\nTo access locally, run this in a terminal:")
+        print(tunnel_cmd)
+        print(f"\nThen open: {localhost_url}")
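
End to end, the new class supports the following workflow. A hypothetical sketch: the JupyterJob arguments mirror the signature above, but the paths, environment name, and the commented-out Server submission and file-retrieval calls are assumptions, not the real API:

    from pathlib import Path
    from sl_shared_assets.server import JupyterJob

    job = JupyterJob(
        job_name="jupyter_demo",                     # hypothetical values throughout
        output_log=Path("/home/user/logs/out.txt"),
        error_log=Path("/home/user/logs/err.txt"),
        working_directory=Path("/home/user/jobs"),
        conda_environment="analysis",                # must provide jupyter notebook / jupyterlab
        notebook_directory=Path("/home/user/data"),
        time_limit=60,                               # keep remote Jupyter servers short-lived
    )

    # Submission and connection-file retrieval go through the Server class, which also populates
    # job.host and job.user; the method names below are placeholders, not the real API:
    # server.submit(job)
    # server.retrieve(job.connection_info_file, Path("connection.txt"))

    job.parse_connection_info(Path("connection.txt"))  # populates job.connection_info
    job.print_connection_info()                        # prints the ssh -N -L tunnel command and localhost URL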