slurmray 3.5.9__tar.gz → 3.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of slurmray might be problematic. Click here for more details.
- {slurmray-3.5.9 → slurmray-3.6.1}/PKG-INFO +2 -2
- {slurmray-3.5.9 → slurmray-3.6.1}/README.md +1 -1
- {slurmray-3.5.9 → slurmray-3.6.1}/pyproject.toml +1 -1
- {slurmray-3.5.9 → slurmray-3.6.1}/slurmray/RayLauncher.py +55 -33
- slurmray-3.6.1/slurmray/assets/slurmray_server.sh +54 -0
- {slurmray-3.5.9 → slurmray-3.6.1}/slurmray/assets/slurmray_server_template.py +1 -4
- slurmray-3.5.9/slurmray/assets/slurmray_server.sh +0 -27
- {slurmray-3.5.9 → slurmray-3.6.1}/LICENSE +0 -0
- {slurmray-3.5.9 → slurmray-3.6.1}/slurmray/__init__.py +0 -0
- {slurmray-3.5.9 → slurmray-3.6.1}/slurmray/assets/sbatch_template.sh +0 -0
- {slurmray-3.5.9 → slurmray-3.6.1}/slurmray/assets/spython_template.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: slurmray
|
|
3
|
-
Version: 3.5.9
|
|
3
|
+
Version: 3.6.1
|
|
4
4
|
Summary: SlurmRay is a module for effortlessly distributing tasks on a Slurm cluster using the Ray library.
|
|
5
5
|
Home-page: https://henri-jamet.vercel.app/
|
|
6
6
|
License: Apache License
|
|
@@ -23,7 +23,7 @@ Description-Content-Type: text/markdown
|
|
|
23
23
|
|
|
24
24
|
# SLURM_RAY
|
|
25
25
|
|
|
26
|
-
👉[Full documentation](https://henri-jamet.
|
|
26
|
+
👉[Full documentation](https://www.henri-jamet.com/docs/slurmray/slurm-ray/)
|
|
27
27
|
|
|
28
28
|
## Description
|
|
29
29
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "slurmray"
|
|
3
|
-
version = "3.5.9"
|
|
3
|
+
version = "3.6.1"
|
|
4
4
|
description = "SlurmRay is a module for effortlessly distributing tasks on a Slurm cluster using the Ray library. "
|
|
5
5
|
authors = ["Henri Jamet <henri.jamet@unil.ch>"]
|
|
6
6
|
license = "Apache License"
|
|
@@ -87,17 +87,19 @@ class RayLauncher:
|
|
|
87
87
|
self.__write_python_script()
|
|
88
88
|
self.script_file, self.job_name = self.__write_slurm_script()
|
|
89
89
|
|
|
90
|
-
def __call__(self, cancel_old_jobs: bool = True) -> Any:
|
|
90
|
+
def __call__(self, cancel_old_jobs: bool = True, serialize: bool = True) -> Any:
|
|
91
91
|
"""Launch the job and return the result
|
|
92
92
|
|
|
93
93
|
Args:
|
|
94
94
|
cancel_old_jobs (bool, optional): Cancel the old jobs. Defaults to True.
|
|
95
|
+
serialize (bool, optional): Serialize the function and the arguments. This should be set to False if the function is automatically called by the server. Defaults to True.
|
|
95
96
|
|
|
96
97
|
Returns:
|
|
97
98
|
Any: Result of the function
|
|
98
99
|
"""
|
|
99
100
|
# Sereialize function and arguments
|
|
100
|
-
|
|
101
|
+
if serialize:
|
|
102
|
+
self.__serialize_func_and_args(self.func, self.args)
|
|
101
103
|
|
|
102
104
|
if self.cluster:
|
|
103
105
|
print("Cluster detected, running on cluster...")
|
|
@@ -168,11 +170,6 @@ class RayLauncher:
|
|
|
168
170
|
"""
|
|
169
171
|
print("Serializing function and arguments...")
|
|
170
172
|
|
|
171
|
-
# Remove the old python script
|
|
172
|
-
for file in os.listdir(self.project_path):
|
|
173
|
-
if file.endswith(".pkl"):
|
|
174
|
-
os.remove(os.path.join(self.project_path, file))
|
|
175
|
-
|
|
176
173
|
# Pickle the function
|
|
177
174
|
with open(os.path.join(self.project_path, "func.pkl"), "wb") as f:
|
|
178
175
|
dill.dump(func, f)
|
|
@@ -299,9 +296,7 @@ class RayLauncher:
|
|
|
299
296
|
|
|
300
297
|
# Wait for log file to be created
|
|
301
298
|
current_queue = None
|
|
302
|
-
queue_log_file = os.path.join(
|
|
303
|
-
self.project_path, "{}_queue.log".format(job_name)
|
|
304
|
-
)
|
|
299
|
+
queue_log_file = os.path.join(self.project_path, "queue.log")
|
|
305
300
|
with open(queue_log_file, "w") as f:
|
|
306
301
|
f.write("")
|
|
307
302
|
print(
|
|
@@ -312,6 +307,7 @@ class RayLauncher:
|
|
|
312
307
|
subprocess.Popen(
|
|
313
308
|
["tail", "-f", os.path.join(self.project_path, "{}.log".format(job_name))]
|
|
314
309
|
)
|
|
310
|
+
start_time = time.time()
|
|
315
311
|
while True:
|
|
316
312
|
time.sleep(0.25)
|
|
317
313
|
if os.path.exists(
|
|
@@ -361,6 +357,12 @@ class RayLauncher:
|
|
|
361
357
|
node_list,
|
|
362
358
|
)
|
|
363
359
|
)[1:]
|
|
360
|
+
|
|
361
|
+
# Update the queue log
|
|
362
|
+
if time.time() - start_time > 60:
|
|
363
|
+
start_time = time.time()
|
|
364
|
+
print("Update time: {}".format(time.strftime("%H:%M:%S")))
|
|
365
|
+
|
|
364
366
|
if current_queue is None or current_queue != to_queue:
|
|
365
367
|
current_queue = to_queue
|
|
366
368
|
with open(queue_log_file, "w") as f:
|
|
@@ -373,7 +375,11 @@ class RayLauncher:
|
|
|
373
375
|
text += "\n"
|
|
374
376
|
f.write(text)
|
|
375
377
|
|
|
378
|
+
# Print the queue
|
|
379
|
+
print(text)
|
|
380
|
+
|
|
376
381
|
# Wait for the job to finish while printing the log
|
|
382
|
+
print("Job started! Waiting for the job to finish...")
|
|
377
383
|
log_cursor_position = 0
|
|
378
384
|
job_finished = False
|
|
379
385
|
while not job_finished:
|
|
@@ -407,7 +413,7 @@ class RayLauncher:
|
|
|
407
413
|
if self.server_password is None:
|
|
408
414
|
# Add ssh key
|
|
409
415
|
self.server_password = getpass("Enter your cluster password: ")
|
|
410
|
-
|
|
416
|
+
|
|
411
417
|
ssh_client.connect(
|
|
412
418
|
hostname=self.server_ssh,
|
|
413
419
|
username=self.server_username,
|
|
@@ -440,11 +446,12 @@ class RayLauncher:
|
|
|
440
446
|
# lines = [re.sub(r'bitsandbytes\n', 'bitsandbytes --global-option="--cuda_ext"\n', line) for line in lines]
|
|
441
447
|
lines = [re.sub(r"slurmray\n", "", line) for line in lines]
|
|
442
448
|
# Add slurmray --pre
|
|
443
|
-
lines.append("slurmray --pre\n")
|
|
449
|
+
lines.append("slurmray --pre \n")
|
|
444
450
|
# Solve torch buf (https://github.com/pytorch/pytorch/issues/111469)
|
|
445
451
|
if "torchaudio\n" or "torchvision\n" in lines:
|
|
446
|
-
lines.append(
|
|
447
|
-
|
|
452
|
+
lines.append(
|
|
453
|
+
"torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121\n"
|
|
454
|
+
)
|
|
448
455
|
|
|
449
456
|
with open(f"{self.project_path}/requirements.txt", "w") as file:
|
|
450
457
|
file.writelines(lines)
|
|
@@ -484,13 +491,24 @@ class RayLauncher:
|
|
|
484
491
|
break
|
|
485
492
|
print(line, end="")
|
|
486
493
|
|
|
494
|
+
stdout.channel.recv_exit_status()
|
|
495
|
+
|
|
487
496
|
# Downloading result
|
|
488
497
|
print("Downloading result...")
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
498
|
+
try:
|
|
499
|
+
sftp.get(
|
|
500
|
+
"slurmray-server/.slogs/server/result.pkl",
|
|
501
|
+
os.path.join(self.project_path, "result.pkl"),
|
|
502
|
+
)
|
|
503
|
+
print("Result downloaded!")
|
|
504
|
+
except FileNotFoundError:
|
|
505
|
+
# Check for errors
|
|
506
|
+
stderr_lines = stderr.readlines()
|
|
507
|
+
if stderr_lines:
|
|
508
|
+
print("\nErrors:\n")
|
|
509
|
+
for line in stderr_lines:
|
|
510
|
+
print(line, end="")
|
|
511
|
+
print("An error occured, please check the logs.")
|
|
494
512
|
|
|
495
513
|
def __write_server_script(self):
|
|
496
514
|
"""This funtion will write a script with the given specifications to run slurmray on the cluster"""
|
|
@@ -541,20 +559,24 @@ if __name__ == "__main__":
|
|
|
541
559
|
return result
|
|
542
560
|
|
|
543
561
|
launcher = RayLauncher(
|
|
544
|
-
project_name="example",
|
|
545
|
-
func=example_func,
|
|
546
|
-
args={"x":
|
|
547
|
-
files=[
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
562
|
+
project_name="example", # Name of the project (will create a directory with this name in the current directory)
|
|
563
|
+
func=example_func, # Function to execute
|
|
564
|
+
args={"x": 5}, # Arguments of the function
|
|
565
|
+
files=[
|
|
566
|
+
"slurmray/RayLauncher.py"
|
|
567
|
+
], # List of files to push to the cluster (file path will be recreated on the cluster)
|
|
568
|
+
modules=[], # List of modules to load on the curnagl Cluster (CUDA & CUDNN are automatically added if use_gpu=True)
|
|
569
|
+
node_nbr=1, # Number of nodes to use
|
|
570
|
+
use_gpu=True, # If you need A100 GPU, you can set it to True
|
|
571
|
+
memory=8, # In MegaBytes
|
|
572
|
+
max_running_time=5, # In minutes
|
|
573
|
+
runtime_env={
|
|
574
|
+
"env_vars": {"NCCL_SOCKET_IFNAME": "eno1"}
|
|
575
|
+
}, # Example of environment variable
|
|
576
|
+
server_run=True, # To run the code on the cluster and not locally
|
|
577
|
+
server_ssh="curnagl.dcsr.unil.ch", # Address of the SLURM server
|
|
578
|
+
server_username="hjamet", # Username to connect to the server
|
|
579
|
+
server_password=None, # Will be asked in the terminal
|
|
558
580
|
)
|
|
559
581
|
|
|
560
582
|
result = launcher()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/bin/sh
|
|
2
|
+
|
|
3
|
+
echo "Installing slurmray server"
|
|
4
|
+
|
|
5
|
+
# Copy files
|
|
6
|
+
mv -t slurmray-server requirements.txt slurmray_server.py
|
|
7
|
+
mv -t slurmray-server/.slogs/server func.pkl args.pkl
|
|
8
|
+
cd slurmray-server
|
|
9
|
+
|
|
10
|
+
# Load modules
|
|
11
|
+
module load gcc python/3.9.13 cuda cudnn
|
|
12
|
+
|
|
13
|
+
# Check if venv exists
|
|
14
|
+
if [ ! -d ".venv" ]; then
|
|
15
|
+
python3 -m venv .venv
|
|
16
|
+
fi
|
|
17
|
+
source .venv/bin/activate
|
|
18
|
+
|
|
19
|
+
# Install requirements
|
|
20
|
+
## Load all installed packages into a variable
|
|
21
|
+
installed_packages=$(pip3 list --format=freeze)
|
|
22
|
+
## Function to check if a package is installed
|
|
23
|
+
is_package_installed() {
|
|
24
|
+
package=$1
|
|
25
|
+
echo "$installed_packages" | grep -i "^$package==" &> /dev/null
|
|
26
|
+
return $?
|
|
27
|
+
}
|
|
28
|
+
## Read the requirements.txt file line by line
|
|
29
|
+
while IFS= read -r package
|
|
30
|
+
do
|
|
31
|
+
# Check if the line is not empty
|
|
32
|
+
if [ -n "$package" ]; then
|
|
33
|
+
echo "Checking package: $package"
|
|
34
|
+
# Extract the package name without options
|
|
35
|
+
package_name=$(echo "$package" | awk '{print $1}' | cut -d'=' -f1)
|
|
36
|
+
if is_package_installed "$package_name"; then
|
|
37
|
+
echo "The package $package_name is already installed."
|
|
38
|
+
else
|
|
39
|
+
echo "Installing package: $package"
|
|
40
|
+
command="pip3 install $package"
|
|
41
|
+
eval "$command"
|
|
42
|
+
if [ $? -ne 0 ]; then
|
|
43
|
+
echo "Error while installing $package"
|
|
44
|
+
fi
|
|
45
|
+
fi
|
|
46
|
+
fi
|
|
47
|
+
done < "requirements.txt"
|
|
48
|
+
|
|
49
|
+
# Fix torch bug (https://github.com/pytorch/pytorch/issues/111469)
|
|
50
|
+
export LD_LIBRARY_PATH=$HOME/slurmray-server/.venv/lib/python3.9/site-packages/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Run server
|
|
54
|
+
python -u slurmray_server.py
|
|
@@ -15,8 +15,5 @@ if __name__ == "__main__":
|
|
|
15
15
|
server_ssh=None,
|
|
16
16
|
server_username=None,
|
|
17
17
|
)
|
|
18
|
-
|
|
19
|
-
# Remove serialization
|
|
20
|
-
launcher.__serialize_func_and_args = lambda *args, **kwargs : print("No serialization done.")
|
|
21
18
|
|
|
22
|
-
result = launcher()
|
|
19
|
+
result = launcher(serialize=False)
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
#!/bin/sh
|
|
2
|
-
|
|
3
|
-
echo "Installing slurmray server"
|
|
4
|
-
|
|
5
|
-
# Copy files
|
|
6
|
-
mv -t slurmray-server requirements.txt slurmray_server.py
|
|
7
|
-
mv -t slurmray-server/.slogs/server func.pkl args.pkl
|
|
8
|
-
cd slurmray-server
|
|
9
|
-
|
|
10
|
-
# Load modules
|
|
11
|
-
module load gcc python/3.9.13 cuda cudnn
|
|
12
|
-
|
|
13
|
-
# Check if venv exists
|
|
14
|
-
if [ ! -d ".venv" ]; then
|
|
15
|
-
python3 -m venv .venv
|
|
16
|
-
fi
|
|
17
|
-
source .venv/bin/activate
|
|
18
|
-
|
|
19
|
-
# Install requirements
|
|
20
|
-
pip3 install -r requirements.txt
|
|
21
|
-
|
|
22
|
-
# Fix torch bug (https://github.com/pytorch/pytorch/issues/111469)
|
|
23
|
-
export LD_LIBRARY_PATH=$HOME/slurmray-server/.venv/lib/python3.9/site-packages/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
# Run server
|
|
27
|
-
python -u slurmray_server.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|