sl-shared-assets 2.0.1__py3-none-any.whl → 3.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sl-shared-assets might be problematic.

@@ -4,20 +4,22 @@ the running job status. All lab processing and analysis pipelines use this inter
 resources.
 """

-import time
+from random import randint
 from pathlib import Path
 import tempfile
-from dataclasses import dataclass
+from dataclasses import field, dataclass

 import paramiko

 # noinspection PyProtectedMember
 from simple_slurm import Slurm  # type: ignore
+from ataraxis_time import PrecisionTimer
 from paramiko.client import SSHClient
 from ataraxis_base_utilities import LogLevel, console
 from ataraxis_data_structures import YamlConfig
+from ataraxis_time.time_helpers import get_timestamp

-from .job import Job
+from .job import Job, JupyterJob


 def generate_server_credentials(
@@ -25,30 +27,36 @@ def generate_server_credentials(
     username: str,
     password: str,
     host: str = "cbsuwsun.biohpc.cornell.edu",
-    raw_data_root: str = "/workdir/sun_data",
-    processed_data_root: str = "/storage/sun_data",
+    storage_root: str = "/local/storage",
+    working_root: str = "/local/workdir",
+    shared_directory_name: str = "sun_data",
 ) -> None:
     """Generates a new server_credentials.yaml file under the specified directory, using input information.

     This function provides a convenience interface for generating new BioHPC server credential files. Generally, this is
-    only used when setting up new host-computers in the lab.
+    only used when setting up new host-computers or users in the lab.

     Args:
         output_directory: The directory where to save the generated server_credentials.yaml file.
         username: The username to use for server authentication.
         password: The password to use for server authentication.
         host: The hostname or IP address of the server to connect to.
-        raw_data_root: The path to the root directory used to store the raw data from all Sun lab projects on the
-            server.
-        processed_data_root: The path to the root directory used to store the processed data from all Sun lab projects
-            on the server.
+        storage_root: The path to the root storage (slow) server directory. Typically, this is the path to the
+            top-level (root) directory of the HDD RAID volume.
+        working_root: The path to the root working (fast) server directory. Typically, this is the path to the
+            top-level (root) directory of the NVME RAID volume. If the server uses the same volume for both storage and
+            working directories, enter the same path under both 'storage_root' and 'working_root'.
+        shared_directory_name: The name of the shared directory used to store all Sun lab project data on the storage
+            and working server volumes.
     """
+    # noinspection PyArgumentList
     ServerCredentials(
         username=username,
         password=password,
         host=host,
-        raw_data_root=raw_data_root,
-        processed_data_root=processed_data_root,
+        storage_root=storage_root,
+        working_root=working_root,
+        shared_directory_name=shared_directory_name,
     ).to_yaml(file_path=output_directory.joinpath("server_credentials.yaml"))


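For orientation, a minimal sketch of how the new signature is called; the import path and output location are assumptions, not part of this diff:

    from pathlib import Path
    from sl_shared_assets import generate_server_credentials  # assumed import path

    generate_server_credentials(
        output_directory=Path.home().joinpath(".sl-assets"),  # hypothetical output location
        username="my_netid",
        password="my_password",
        storage_root="/local/storage",
        working_root="/local/workdir",
    )
    # Writes <output_directory>/server_credentials.yaml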
@@ -68,11 +76,37 @@ class ServerCredentials(YamlConfig):
     """The password to use for server authentication."""
     host: str = "cbsuwsun.biohpc.cornell.edu"
     """The hostname or IP address of the server to connect to."""
-    raw_data_root: str = "/workdir/sun_data"
+    storage_root: str = "/local/storage"
+    """The path to the root storage (slow) server directory. Typically, this is the path to the top-level (root)
+    directory of the HDD RAID volume."""
+    working_root: str = "/local/workdir"
+    """The path to the root working (fast) server directory. Typically, this is the path to the top-level (root)
+    directory of the NVME RAID volume. If the server uses the same volume for both storage and working directories,
+    enter the same path under both 'storage_root' and 'working_root'."""
+    shared_directory_name: str = "sun_data"
+    """Stores the name of the shared directory used to store all Sun lab project data on the storage and working
+    server volumes."""
+    raw_data_root: str = field(init=False, default_factory=lambda: "/local/storage/sun_data")
     """The path to the root directory used to store the raw data from all Sun lab projects on the target server."""
-    processed_data_root: str = "/storage/sun_data"
+    processed_data_root: str = field(init=False, default_factory=lambda: "/local/workdir/sun_data")
     """The path to the root directory used to store the processed data from all Sun lab projects on the target
     server."""
+    user_data_root: str = field(init=False, default_factory=lambda: "/local/storage/YourNetID")
+    """The path to the root directory of the user on the target server. Unlike raw and processed data roots, which are
+    shared between all Sun lab users, each user_data directory is unique for every server user."""
+    user_working_root: str = field(init=False, default_factory=lambda: "/local/workdir/YourNetID")
+    """The path to the root user working directory on the target server. This directory is unique for every user."""
+
+    def __post_init__(self) -> None:
+        """Statically resolves the paths to end-point directories using the provided root directories."""
+
+        # Shared Sun lab directories use the shared directory name under both root volumes
+        self.raw_data_root = str(Path(self.storage_root).joinpath(self.shared_directory_name))
+        self.processed_data_root = str(Path(self.working_root).joinpath(self.shared_directory_name))
+
+        # User directories exist at the same level as the 'shared' root project directories, but use user-ids as names
+        self.user_data_root = str(Path(self.storage_root).joinpath(f"{self.username}"))
+        self.user_working_root = str(Path(self.working_root).joinpath(f"{self.username}"))


 class Server:
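Given the __post_init__ logic above, the end-point paths are derived from the roots at construction time. A quick sketch of the expected resolution; the values follow directly from the code in this hunk:

    creds = ServerCredentials(
        username="my_netid",
        password="my_password",
        storage_root="/local/storage",
        working_root="/local/workdir",
    )
    creds.raw_data_root        # '/local/storage/sun_data'
    creds.processed_data_root  # '/local/workdir/sun_data'
    creds.user_data_root       # '/local/storage/my_netid'
    creds.user_working_root    # '/local/workdir/my_netid'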
@@ -105,6 +139,9 @@ class Server:
         # Loads the credentials from the provided .yaml file
         self._credentials: ServerCredentials = ServerCredentials.from_yaml(credentials_path)  # type: ignore

+        # Initializes a timer class to optionally delay loop cycling below
+        timer = PrecisionTimer("s")
+
         # Establishes the SSH connection to the specified processing server. At most, attempts to connect to the server
         # 30 times before terminating with an error
         attempt = 0
@@ -135,17 +172,140 @@ class Server:
                     raise RuntimeError

                 console.echo(
-                    f"Could not SSH to {self._credentials.host}, retrying after a 2-second delay...",
+                    f"Could not SSH into {self._credentials.host}, retrying after a 2-second delay...",
                     level=LogLevel.WARNING,
                 )
                 attempt += 1
-                time.sleep(2)
+                timer.delay_noblock(delay=2, allow_sleep=True)

     def __del__(self) -> None:
         """If the instance is connected to the server, terminates the connection before the instance is destroyed."""
         self.close()

-    def submit_job(self, job: Job) -> Job:
+    def create_job(
+        self,
+        job_name: str,
+        conda_environment: str,
+        cpus_to_use: int = 10,
+        ram_gb: int = 10,
+        time_limit: int = 60,
+    ) -> Job:
+        """Creates and returns a new Job instance.
+
+        Use this method to generate Job objects for all headless jobs that need to be run on the remote server. The
+        generated Job is a precursor that requires further configuration by the user before it can be submitted to the
+        server for execution.
+
+        Args:
+            job_name: The descriptive name of the SLURM job to be created. Primarily, this name is used in terminal
+                printouts to identify the job to human operators.
+            conda_environment: The name of the conda environment to activate on the server before running the job logic.
+                The environment should contain the necessary Python packages and CLIs to support running the job's
+                logic.
+            cpus_to_use: The number of CPUs to use for the job.
+            ram_gb: The amount of RAM to allocate for the job, in Gigabytes.
+            time_limit: The maximum time limit for the job, in minutes. If the job is still running at the end of this
+                time period, it will be forcibly terminated. It is highly advised to always set adequate maximum runtime
+                limits to prevent jobs from hogging the server in case of runtime or algorithm errors.
+
+        Returns:
+            The initialized Job instance pre-filled with SLURM configuration data and conda activation commands. Modify
+            the returned instance with any additional commands as necessary for the job to fulfill its intended
+            purpose. Note that the Job requires submission via submit_job() to be executed by the server.
+        """
+        # Statically configures the working directory to be stored under:
+        # user working root / job_logs / job_name_timestamp
+        timestamp = get_timestamp()
+        working_directory = Path(self.user_working_root.joinpath("job_logs", f"{job_name}_{timestamp}"))
+        self.create_directory(remote_path=working_directory, parents=True)
+
+        return Job(
+            job_name=job_name,
+            output_log=working_directory.joinpath("stdout.txt"),
+            error_log=working_directory.joinpath("stderr.txt"),
+            working_directory=working_directory,
+            conda_environment=conda_environment,
+            cpus_to_use=cpus_to_use,
+            ram_gb=ram_gb,
+            time_limit=time_limit,
+        )
+
+    def launch_jupyter_server(
+        self,
+        job_name: str,
+        conda_environment: str,
+        notebook_directory: Path,
+        cpus_to_use: int = 2,
+        ram_gb: int = 32,
+        time_limit: int = 240,
+        port: int = 0,
+        jupyter_args: str = "",
+    ) -> JupyterJob:
+        """Launches a Jupyter notebook server on the target remote Sun lab server.
+
+        Use this method to run interactive Jupyter sessions on the remote server under SLURM control. Unlike
+        create_job(), this method automatically submits the job for execution as part of its runtime. Therefore, the
+        returned JupyterJob instance should only be used to query information about how to connect to the remote
+        Jupyter server.
+
+        Args:
+            job_name: The descriptive name of the Jupyter SLURM job to be created. Primarily, this name is used in
+                terminal printouts to identify the job to human operators.
+            conda_environment: The name of the conda environment to activate on the server before running the job logic.
+                The environment should contain the necessary Python packages and CLIs to support running the job's
+                logic. For Jupyter jobs, this necessarily includes the Jupyter notebook and jupyterlab packages.
+            notebook_directory: The directory to use as Jupyter's root. During runtime, Jupyter will only have GUI
+                access to items stored in or under this directory. For most runtimes, this should be set to the user's
+                root data or working directory.
+            cpus_to_use: The number of CPUs to allocate to the Jupyter server. Keep this value as small as possible to
+                avoid interfering with headless data processing jobs.
+            ram_gb: The amount of RAM, in GB, to allocate to the Jupyter server. Keep this value as small as possible to
+                avoid interfering with headless data processing jobs.
+            time_limit: The maximum Jupyter server uptime, in minutes. Set this to the expected duration of your
+                Jupyter session.
+            port: The connection port number for the Jupyter server. If set to 0 (default), a random port number between
+                8888 and 9999 will be assigned to this connection to reduce the possibility of colliding with other
+                user sessions.
+            jupyter_args: Stores additional arguments to pass to the jupyter notebook initialization command.
+
+        Returns:
+            The initialized JupyterJob instance that stores information on how to connect to the created Jupyter server.
+            Do NOT re-submit the job to the server, as this is done as part of this method's runtime.
+
+        Raises:
+            TimeoutError: If the target Jupyter server doesn't start within 2 minutes of this method being called.
+            RuntimeError: If job submission fails for any reason.
+        """
+
+        # Statically configures the working directory to be stored under:
+        # user working root / job_logs / job_name_timestamp
+        timestamp = get_timestamp()
+        working_directory = Path(self.user_working_root.joinpath("job_logs", f"{job_name}_{timestamp}"))
+        self.create_directory(remote_path=working_directory, parents=True)
+
+        # If necessary, generates and sets the port to a random value between 8888 and 9999.
+        if port == 0:
+            port = randint(8888, 9999)
+
+        job = JupyterJob(
+            job_name=job_name,
+            output_log=working_directory.joinpath("stdout.txt"),
+            error_log=working_directory.joinpath("stderr.txt"),
+            working_directory=working_directory,
+            conda_environment=conda_environment,
+            notebook_directory=notebook_directory,
+            port=port,
+            cpus_to_use=cpus_to_use,
+            ram_gb=ram_gb,
+            time_limit=time_limit,
+            jupyter_args=jupyter_args,
+        )
+
+        # Submits the job to the server and, if submission is successful, returns the JupyterJob object extended to
+        # include connection data received from the server.
+        return self.submit_job(job)  # type: ignore[return-value]
+
+    def submit_job(self, job: Job | JupyterJob) -> Job | JupyterJob:
         """Submits the input job to the managed BioHPC server via SLURM job manager.

         This method submits various jobs for execution via SLURM-managed BioHPC cluster. As part of its runtime, the
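A hedged sketch of the headless-job workflow introduced by create_job(); the Server constructor argument name and the way commands are attached to the Job are assumptions, since neither appears in this hunk:

    server = Server(credentials_path=Path("server_credentials.yaml"))  # assumed constructor signature
    job = server.create_job(
        job_name="suite2p_run",
        conda_environment="processing",
        cpus_to_use=16,
        ram_gb=64,
        time_limit=120,
    )
    # ... extend the returned Job with the commands it should execute (Job API not shown in this diff) ...
    job = server.submit_job(job)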
@@ -162,6 +322,7 @@ class Server:
         Raises:
             RuntimeError: If job submission to the server fails.
         """
+        console.echo(message=f"Submitting '{job.job_name}' job to the remote server {self.host}...")

         # Generates a temporary shell script on the local machine. Uses tempfile to automatically remove the
         # local script as soon as it is uploaded to the server.
@@ -197,9 +358,62 @@ class Server:
         # Job object
         job_id = job_output.split()[-1]
         job.job_id = job_id
+
+        # Special processing for Jupyter jobs
+        if isinstance(job, JupyterJob):
+            # Transfers host and user information to the JupyterJob object
+            job.host = self.host
+            job.user = self.user
+
+            # Initializes a timer class to optionally delay loop cycling below
+            timer = PrecisionTimer("s")
+
+            timer.reset()
+            while timer.elapsed < 120:  # Waits for at most 2 minutes before terminating with an error
+                # Checks if the connection info file exists
+                try:
+                    # Pulls the connection info file
+                    local_info_file = Path(f"/tmp/{job.job_name}_connection.txt")
+                    self.pull_file(local_file_path=local_info_file, remote_file_path=job.connection_info_file)
+
+                    # Parses connection data from the file and caches it inside Job class attributes
+                    job.parse_connection_info(local_info_file)
+
+                    # Removes the local file copy after it is parsed
+                    local_info_file.unlink(missing_ok=True)
+
+                    # Also removes the remote copy once the runtime is over
+                    self.remove(remote_path=job.connection_info_file, is_dir=False)
+
+                    # Breaks the waiting loop
+                    break
+
+                except Exception:
+                    # The file doesn't exist yet or job initialization failed
+                    if self.job_complete(job):
+                        message = (
+                            f"Remote Jupyter server job {job.job_name} with id {job.job_id} encountered a startup "
+                            f"error and was terminated prematurely."
+                        )
+                        console.error(message, RuntimeError)
+
+                    timer.delay_noblock(delay=5, allow_sleep=True)  # Waits for 5 seconds before checking again
+            else:
+                # Only raises a timeout error if the while loop is not broken within 120 seconds
+                message = (
+                    f"Remote Jupyter server job {job.job_name} with id {job.job_id} did not start within 120 seconds "
+                    f"from being submitted. Since all Jupyter jobs are intended to be interactive and the server is "
+                    f"busy running other jobs, this job is cancelled. Try again when the server is less busy."
+                )
+                console.error(message, TimeoutError)
+                raise TimeoutError(message)  # Fallback to appease mypy
+
+        console.echo(message=f"{job.job_name} job: Submitted to {self.host}.", level=LogLevel.SUCCESS)
+
+        # Returns the updated job object
         return job

-    def job_complete(self, job: Job) -> bool:
+    def job_complete(self, job: Job | JupyterJob) -> bool:
         """Returns True if the job managed by the input Job instance has been completed or terminated its runtime due
         to an error.

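Continuing the sketch above, the interactive workflow looks roughly as follows; the attributes cached by parse_connection_info() are assumptions, since only connection_info_file appears in this diff:

    jupyter = server.launch_jupyter_server(
        job_name="analysis_notebook",
        conda_environment="jupyter_env",
        notebook_directory=server.user_data_root,
    )
    # The returned JupyterJob caches the parsed connection details, e.g. for building
    # an SSH tunnel to the compute node (attribute names hypothetical):
    # ssh -L {jupyter.port}:{jupyter.node}:{jupyter.port} {server.user}@{server.host}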
@@ -228,6 +442,24 @@ class Server:
         else:
             return False

+    def abort_job(self, job: Job | JupyterJob) -> None:
+        """Aborts the target job if it is currently running on the server.
+
+        Use this method to immediately abort running or queued jobs, without waiting for the timeout guard. If the job
+        is queued, this method will remove it from the SLURM queue. If the job has already terminated, this method
+        does nothing.
+
+        Args:
+            job: The Job object that needs to be aborted.
+        """
+
+        # Sends the 'scancel' command to the server targeting the specific Job via ID, unless the job is already
+        # complete
+        if not self.job_complete(job):
+            self._client.exec_command(f"scancel {job.job_id}")
+
+        console.echo(message=f"{job.job_name} job: Aborted.", level=LogLevel.SUCCESS)
+
     def pull_file(self, local_file_path: Path, remote_file_path: Path) -> None:
         """Moves the specified file from the remote server to the local machine.

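Together with job_complete(), abort_job() supports a simple client-side watchdog; a minimal sketch, with an illustrative polling interval and guard on top of the job's SLURM time limit:

    timer = PrecisionTimer("s")
    timer.reset()
    while not server.job_complete(job):
        if timer.elapsed > 3600:  # client-side guard, in addition to the job's SLURM time_limit
            server.abort_job(job)
            break
        timer.delay_noblock(delay=10, allow_sleep=True)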
@@ -236,8 +468,10 @@ class Server:
             remote_file_path: The path to the target file on the remote server (the file to be copied).
         """
         sftp = self._client.open_sftp()
-        sftp.get(localpath=local_file_path, remotepath=str(remote_file_path))
-        sftp.close()
+        try:
+            sftp.get(localpath=local_file_path, remotepath=str(remote_file_path))
+        finally:
+            sftp.close()

     def push_file(self, local_file_path: Path, remote_file_path: Path) -> None:
         """Moves the specified file from the local machine to the remote server.
@@ -247,8 +481,10 @@ class Server:
             remote_file_path: The path to the file on the remote server (where to copy the file).
         """
         sftp = self._client.open_sftp()
-        sftp.put(localpath=local_file_path, remotepath=str(remote_file_path))
-        sftp.close()
+        try:
+            sftp.put(localpath=local_file_path, remotepath=str(remote_file_path))
+        finally:
+            sftp.close()

     def remove(self, remote_path: Path, is_dir: bool) -> None:
         """Removes the specified file or directory from the remote server.
@@ -258,11 +494,87 @@ class Server:
             is_dir: Determines whether the input path represents a directory or a file.
         """
         sftp = self._client.open_sftp()
-        if is_dir:
-            sftp.rmdir(path=str(remote_path))
-        else:
-            sftp.unlink(path=str(remote_path))
-        sftp.close()
+        try:
+            if is_dir:
+                sftp.rmdir(path=str(remote_path))
+            else:
+                sftp.unlink(path=str(remote_path))
+        finally:
+            sftp.close()
+
+    def create_directory(self, remote_path: Path, parents: bool = True) -> None:
+        """Creates the specified directory tree on the managed remote server via SFTP.
+
+        This method creates directories on the remote server, with options to create parent directories and handle
+        existing directories gracefully.
+
+        Args:
+            remote_path: The absolute path to the directory to create on the remote server.
+            parents: Determines whether to create missing parent directories. If this is disabled and the parents do
+                not exist, the method raises a FileNotFoundError.
+
+        Notes:
+            This method assumes that an already existing directory is a successful runtime end-point and silently
+            returns in that case.
+        """
+        sftp = self._client.open_sftp()
+
+        try:
+            # Converts the target path to string for SFTP operations
+            remote_path_str = str(remote_path)
+
+            if parents:
+                # Creates parent directories if needed:
+                # splits the path into parts and creates each level in turn
+                path_parts = Path(remote_path_str).parts
+                current_path = ""
+
+                for part in path_parts:
+                    # Skips empty path parts
+                    if not part:
+                        continue
+
+                    if current_path:
+                        # Keeps stacking path components on top of the current_path object
+                        current_path = str(Path(current_path).joinpath(part))
+                    else:
+                        # Initially, the current path is empty, so it is set to the first part
+                        current_path = part
+
+                    try:
+                        # Checks if the directory exists by trying to stat it
+                        sftp.stat(current_path)
+                    except FileNotFoundError:
+                        # If the directory does not exist, creates it
+                        sftp.mkdir(current_path)
+            else:
+                # Otherwise, only creates the final directory
+                try:
+                    # Checks if the directory already exists
+                    sftp.stat(remote_path_str)
+                except FileNotFoundError:
+                    # Creates the directory if it does not exist
+                    sftp.mkdir(remote_path_str)
+
+        # Ensures the sftp connection is closed.
+        finally:
+            sftp.close()
+
+    def exists(self, remote_path: Path) -> bool:
+        """Returns True if the target file or directory exists on the remote server."""
+
+        sftp = self._client.open_sftp()
+        try:
+            # Checks if the target file or directory exists by trying to stat it
+            sftp.stat(str(remote_path))
+
+            # If the request does not err, returns True (the file or directory exists)
+            return True
+
+        # If the directory or file does not exist, returns False
+        except FileNotFoundError:
+            return False

     def close(self) -> None:
         """Closes the SSH connection to the server.
@@ -274,15 +586,37 @@ class Server:
         self._client.close()

     @property
-    def raw_data_root(self) -> str:
+    def raw_data_root(self) -> Path:
         """Returns the absolute path to the directory used to store the raw data for all Sun lab projects on the server
         accessible through this class.
         """
-        return self._credentials.raw_data_root
+        return Path(self._credentials.raw_data_root)

     @property
-    def processed_data_root(self) -> str:
+    def processed_data_root(self) -> Path:
         """Returns the absolute path to the directory used to store the processed data for all Sun lab projects on the
         server accessible through this class.
         """
-        return self._credentials.processed_data_root
+        return Path(self._credentials.processed_data_root)
+
+    @property
+    def user_data_root(self) -> Path:
+        """Returns the absolute path to the directory used to store user-specific data on the server accessible through
+        this class."""
+        return Path(self._credentials.user_data_root)
+
+    @property
+    def user_working_root(self) -> Path:
+        """Returns the absolute path to the user-specific working (fast) directory on the server accessible through
+        this class."""
+        return Path(self._credentials.user_working_root)
+
+    @property
+    def host(self) -> str:
+        """Returns the hostname or IP address of the server accessible through this class."""
+        return self._credentials.host
+
+    @property
+    def user(self) -> str:
+        """Returns the username used to authenticate with the server."""
+        return self._credentials.username
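With the str-to-Path change in these properties, callers can join paths directly off the accessors; a closing sketch (import path assumed):

    from pathlib import Path
    from sl_shared_assets import Server  # assumed import path

    server = Server(credentials_path=Path("server_credentials.yaml"))
    session_raw = server.raw_data_root.joinpath("project_a", "session_001")  # Path in 3.0.0rc1, str in 2.0.1
    print(server.host, server.user)
    server.close()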