slurmray 3.5.9__py3-none-any.whl → 3.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of slurmray might be problematic.

slurmray/RayLauncher.py CHANGED
@@ -87,17 +87,19 @@ class RayLauncher:
         self.__write_python_script()
         self.script_file, self.job_name = self.__write_slurm_script()
 
-    def __call__(self, cancel_old_jobs: bool = True) -> Any:
+    def __call__(self, cancel_old_jobs: bool = True, serialize: bool = True) -> Any:
         """Launch the job and return the result
 
         Args:
             cancel_old_jobs (bool, optional): Cancel the old jobs. Defaults to True.
+            serialize (bool, optional): Serialize the function and the arguments. This should be set to False if the function is automatically called by the server. Defaults to True.
 
         Returns:
             Any: Result of the function
         """
         # Sereialize function and arguments
-        self.__serialize_func_and_args(self.func, self.args)
+        if serialize:
+            self.__serialize_func_and_args(self.func, self.args)
 
         if self.cluster:
             print("Cluster detected, running on cluster...")
@@ -168,11 +170,6 @@ class RayLauncher:
         """
         print("Serializing function and arguments...")
 
-        # Remove the old python script
-        for file in os.listdir(self.project_path):
-            if file.endswith(".pkl"):
-                os.remove(os.path.join(self.project_path, file))
-
         # Pickle the function
         with open(os.path.join(self.project_path, "func.pkl"), "wb") as f:
             dill.dump(func, f)
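With the cleanup loop gone, stale pickles are simply overwritten: opening func.pkl in "wb" mode truncates the file before dill writes the new payload. dill is used rather than the standard pickle module because pickle stores most functions as an import path, so lambdas, closures, and functions defined in __main__ cannot be rebuilt on the cluster, while dill can serialize the function body itself. A minimal round-trip sketch (standalone, not package code):

    import dill

    def example_func(x):
        return x * 2

    # "wb" truncates any previous func.pkl, so no prior cleanup is needed
    with open("func.pkl", "wb") as f:
        dill.dump(example_func, f)

    # Restore the function and call it, e.g. in another process
    with open("func.pkl", "rb") as f:
        restored = dill.load(f)

    assert restored(21) == 42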
@@ -299,9 +296,7 @@ class RayLauncher:
 
         # Wait for log file to be created
         current_queue = None
-        queue_log_file = os.path.join(
-            self.project_path, "{}_queue.log".format(job_name)
-        )
+        queue_log_file = os.path.join(self.project_path, "queue.log")
         with open(queue_log_file, "w") as f:
             f.write("")
         print(
@@ -312,6 +307,7 @@ class RayLauncher:
         subprocess.Popen(
             ["tail", "-f", os.path.join(self.project_path, "{}.log".format(job_name))]
         )
+        start_time = time.time()
         while True:
             time.sleep(0.25)
             if os.path.exists(
@@ -361,6 +357,12 @@ class RayLauncher:
                     node_list,
                 )
             )[1:]
+
+            # Update the queue log
+            if time.time() - start_time > 60:
+                start_time = time.time()
+                print("Update time: {}".format(time.strftime("%H:%M:%S")))
+
             if current_queue is None or current_queue != to_queue:
                 current_queue = to_queue
                 with open(queue_log_file, "w") as f:
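The added start_time bookkeeping is a standard poll-loop throttle: the loop still wakes every 0.25 s to check the queue, but the heartbeat line is printed at most once per 60-second window. The same pattern in isolation (interval and exit condition are illustrative):

    import time

    start_time = time.time()
    while True:
        time.sleep(0.25)                    # fast poll for new state
        if time.time() - start_time > 60:   # slow, human-readable heartbeat
            start_time = time.time()        # reset the window
            print("Update time: {}".format(time.strftime("%H:%M:%S")))
        # ... check the queue / job state here and break when done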
@@ -373,7 +375,11 @@ class RayLauncher:
                         text += "\n"
                     f.write(text)
 
+                # Print the queue
+                print(text)
+
         # Wait for the job to finish while printing the log
+        print("Job started! Waiting for the job to finish...")
         log_cursor_position = 0
         job_finished = False
         while not job_finished:
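The log_cursor_position variable suggests the follow-up loop reads the SLURM log incrementally: remember how far the file has been read, print only the new bytes, and stop once a completion marker appears. A self-contained sketch of that pattern (the log path and the result-file sentinel are assumptions, not code from this package):

    import os
    import time

    log_path = "example.log"    # hypothetical log file
    done_path = "result.pkl"    # hypothetical completion sentinel
    log_cursor_position = 0
    job_finished = False
    while not job_finished:
        time.sleep(0.25)
        if os.path.exists(log_path):
            with open(log_path) as f:
                f.seek(log_cursor_position)     # skip what was already printed
                print(f.read(), end="")
                log_cursor_position = f.tell()  # remember the new offset
        job_finished = os.path.exists(done_path)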
@@ -407,7 +413,7 @@ class RayLauncher:
         if self.server_password is None:
             # Add ssh key
             self.server_password = getpass("Enter your cluster password: ")
-
+
         ssh_client.connect(
             hostname=self.server_ssh,
             username=self.server_username,
@@ -440,11 +446,12 @@ class RayLauncher:
         # lines = [re.sub(r'bitsandbytes\n', 'bitsandbytes --global-option="--cuda_ext"\n', line) for line in lines]
         lines = [re.sub(r"slurmray\n", "", line) for line in lines]
         # Add slurmray --pre
-        lines.append("slurmray --pre\n")
+        lines.append("slurmray --pre \n")
         # Solve torch buf (https://github.com/pytorch/pytorch/issues/111469)
         if "torchaudio\n" or "torchvision\n" in lines:
-            lines.append("torch==2.1.1\n")
-            lines.append("--index-url https://download.pytorch.org/whl/cu121\n")
+            lines.append(
+                "torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121\n"
+            )
 
         with open(f"{self.project_path}/requirements.txt", "w") as file:
             file.writelines(lines)
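Two things are worth noting here. Merging the pin and its index onto one line matters because the reworked slurmray_server.sh (later in this diff) installs each requirements line as a separate pip3 install command, where --index-url is an ordinary CLI flag; a bare --index-url line, as in the old two-line form, would be run as pip3 install with no package and fail. Separately, the condition `"torchaudio\n" or "torchvision\n" in lines` is always true: the non-empty string literal on the left of `or` is truthy and short-circuits before the membership test runs, so the pin is appended unconditionally. A sketch of the check the code apparently intends:

    # Hypothetical fix: test membership of each string, rather than the
    # truthiness of a string literal.
    if "torchaudio\n" in lines or "torchvision\n" in lines:
        lines.append(
            "torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121\n"
        )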
@@ -484,13 +491,24 @@ class RayLauncher:
                     break
                 print(line, end="")
 
+        stdout.channel.recv_exit_status()
+
         # Downloading result
         print("Downloading result...")
-        sftp.get(
-            "slurmray-server/.slogs/server/result.pkl",
-            os.path.join(self.project_path, "result.pkl"),
-        )
-        print("Result downloaded!")
+        try:
+            sftp.get(
+                "slurmray-server/.slogs/server/result.pkl",
+                os.path.join(self.project_path, "result.pkl"),
+            )
+            print("Result downloaded!")
+        except FileNotFoundError:
+            # Check for errors
+            stderr_lines = stderr.readlines()
+            if stderr_lines:
+                print("\nErrors:\n")
+                for line in stderr_lines:
+                    print(line, end="")
+            print("An error occured, please check the logs.")
 
     def __write_server_script(self):
         """This funtion will write a script with the given specifications to run slurmray on the cluster"""
@@ -541,20 +559,24 @@ if __name__ == "__main__":
         return result
 
     launcher = RayLauncher(
-        project_name="example", # Name of the project (will create a directory with this name in the current directory)
-        func=example_func, # Function to execute
-        args={"x": 1}, # Arguments of the function
-        files=["slurmray/RayLauncher.py"], # List of files to push to the cluster (file path will be recreated on the cluster)
-        modules=[], # List of modules to load on the curnagl Cluster (CUDA & CUDNN are automatically added if use_gpu=True)
-        node_nbr=1, # Number of nodes to use
-        use_gpu=True, # If you need A100 GPU, you can set it to True
-        memory=8, # In MegaBytes
-        max_running_time=5, # In minutes
-        runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "eno1"}}, # Example of environment variable
-        server_run=True, # To run the code on the cluster and not locally
-        server_ssh="curnagl.dcsr.unil.ch", # Address of the SLURM server
-        server_username="hjamet", # Username to connect to the server
-        server_password=None, # Will be asked in the terminal
+        project_name="example",  # Name of the project (will create a directory with this name in the current directory)
+        func=example_func,  # Function to execute
+        args={"x": 5},  # Arguments of the function
+        files=[
+            "slurmray/RayLauncher.py"
+        ],  # List of files to push to the cluster (file path will be recreated on the cluster)
+        modules=[],  # List of modules to load on the curnagl Cluster (CUDA & CUDNN are automatically added if use_gpu=True)
+        node_nbr=1,  # Number of nodes to use
+        use_gpu=True,  # If you need A100 GPU, you can set it to True
+        memory=8,  # In MegaBytes
+        max_running_time=5,  # In minutes
+        runtime_env={
+            "env_vars": {"NCCL_SOCKET_IFNAME": "eno1"}
+        },  # Example of environment variable
+        server_run=True,  # To run the code on the cluster and not locally
+        server_ssh="curnagl.dcsr.unil.ch",  # Address of the SLURM server
+        server_username="hjamet",  # Username to connect to the server
+        server_password=None,  # Will be asked in the terminal
     )
 
     result = launcher()
slurmray/assets/slurmray_server.sh CHANGED
@@ -17,7 +17,34 @@ fi
 source .venv/bin/activate
 
 # Install requirements
-pip3 install -r requirements.txt
+## Load all installed packages into a variable
+installed_packages=$(pip3 list --format=freeze)
+## Function to check if a package is installed
+is_package_installed() {
+    package=$1
+    echo "$installed_packages" | grep -i "^$package==" &> /dev/null
+    return $?
+}
+## Read the requirements.txt file line by line
+while IFS= read -r package
+do
+    # Check if the line is not empty
+    if [ -n "$package" ]; then
+        echo "Checking package: $package"
+        # Extract the package name without options
+        package_name=$(echo "$package" | awk '{print $1}' | cut -d'=' -f1)
+        if is_package_installed "$package_name"; then
+            echo "The package $package_name is already installed."
+        else
+            echo "Installing package: $package"
+            command="pip3 install $package"
+            eval "$command"
+            if [ $? -ne 0 ]; then
+                echo "Error while installing $package"
+            fi
+        fi
+    fi
+done < "requirements.txt"
 
 # Fix torch bug (https://github.com/pytorch/pytorch/issues/111469)
 export LD_LIBRARY_PATH=$HOME/slurmray-server/.venv/lib/python3.9/site-packages/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH
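Installing line by line means an already-satisfied package is skipped (the grep -i against pip3 list --format=freeze is a case-insensitive name match) and any inline options, such as the --index-url attached to the torch pin earlier in this diff, reach pip3 install as ordinary CLI flags. A sketch of the same pre-check in Python, under the same requirements.txt layout:

    import subprocess

    # Snapshot of installed packages, one "name==version" line each,
    # equivalent to the script's `pip3 list --format=freeze`.
    frozen = subprocess.run(
        ["pip3", "list", "--format=freeze"], capture_output=True, text=True
    ).stdout
    installed = {line.split("==")[0].lower() for line in frozen.splitlines() if line}

    with open("requirements.txt") as f:
        for raw in f:
            req = raw.strip()
            if not req:
                continue
            # First token is the requirement; later tokens are pip options.
            name = req.split()[0].split("==")[0].lower()
            if name in installed:
                print(f"The package {name} is already installed.")
            else:
                # Options on the line become CLI flags, e.g. --index-url
                subprocess.run(["pip3", "install", *req.split()], check=False)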
slurmray/assets/slurmray_server_template.py CHANGED
@@ -15,8 +15,5 @@ if __name__ == "__main__":
         server_ssh=None,
         server_username=None,
     )
-
-    # Remove serialization
-    launcher.__serialize_func_and_args = lambda *args, **kwargs : print("No serialization done.")
 
-    result = launcher()
+    result = launcher(serialize=False)
{slurmray-3.5.9.dist-info → slurmray-3.6.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: slurmray
-Version: 3.5.9
+Version: 3.6.1
 Summary: SlurmRay is a module for effortlessly distributing tasks on a Slurm cluster using the Ray library.
 Home-page: https://henri-jamet.vercel.app/
 License: Apache License
@@ -23,7 +23,7 @@ Description-Content-Type: text/markdown
 
 # SLURM_RAY
 
-👉[Full documentation](https://henri-jamet.vercel.app/cards/documentation/slurm-ray/slurm-ray/)
+👉[Full documentation](https://www.henri-jamet.com/docs/slurmray/slurm-ray/)
 
 ## Description
 
slurmray-3.6.1.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+slurmray/RayLauncher.py,sha256=KfmlTBqfv3KFe1Kyk1n6lEZwJfLnZKQxKKpKlRNAre8,23452
+slurmray/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+slurmray/assets/sbatch_template.sh,sha256=c-7J4ItzrctDrbF5Znu8p1d_xIgayC9puhjX3nLMzsk,2273
+slurmray/assets/slurmray_server.sh,sha256=-PpX3AitLVfAYjyNqE3BjtDu5uvk11KoiaCUVgmtcEQ,1506
+slurmray/assets/slurmray_server_template.py,sha256=JwX2pH7K9pb10JuRxe3EsjSUp1t3xm0ojEh8H2W2lVo,461
+slurmray/assets/spython_template.py,sha256=kRUvNQs9iCcg0wJLmm9LV0TnbUdlenZMYPr_bZPkXLg,597
+slurmray-3.6.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+slurmray-3.6.1.dist-info/METADATA,sha256=kqs_CKzsLCACuKE67WdILR7LosGAtTRe-xDez-Trozk,3530
+slurmray-3.6.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+slurmray-3.6.1.dist-info/RECORD,,
slurmray-3.5.9.dist-info/RECORD REMOVED
@@ -1,10 +0,0 @@
-slurmray/RayLauncher.py,sha256=6ZS8o4CT2ulwTlmW0ahImNEHZy0h-srgNyJoxGew8lg,22617
-slurmray/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-slurmray/assets/sbatch_template.sh,sha256=c-7J4ItzrctDrbF5Znu8p1d_xIgayC9puhjX3nLMzsk,2273
-slurmray/assets/slurmray_server.sh,sha256=BpmyczNtMlsRimbUYU2XSE59YHGSsozo3rqSQcXNubQ,638
-slurmray/assets/slurmray_server_template.py,sha256=xjuF3nwvQONRxQNzXkAsgFkInY80y6ynkHE9zJjw0xk,575
-slurmray/assets/spython_template.py,sha256=kRUvNQs9iCcg0wJLmm9LV0TnbUdlenZMYPr_bZPkXLg,597
-slurmray-3.5.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-slurmray-3.5.9.dist-info/METADATA,sha256=lbPv9a_Mn5nyXayOfrQka6rZG6QmtRzOVOsyjAxdJO4,3549
-slurmray-3.5.9.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-slurmray-3.5.9.dist-info/RECORD,,