slurmray 3.5.9__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of slurmray might be problematic. Click here for more details.
- slurmray/RayLauncher.py +50 -30
- slurmray/assets/slurmray_server.sh +28 -1
- slurmray/assets/slurmray_server_template.py +1 -4
- {slurmray-3.5.9.dist-info → slurmray-3.6.0.dist-info}/METADATA +2 -2
- slurmray-3.6.0.dist-info/RECORD +10 -0
- slurmray-3.5.9.dist-info/RECORD +0 -10
- {slurmray-3.5.9.dist-info → slurmray-3.6.0.dist-info}/LICENSE +0 -0
- {slurmray-3.5.9.dist-info → slurmray-3.6.0.dist-info}/WHEEL +0 -0
slurmray/RayLauncher.py
CHANGED
|
@@ -168,10 +168,12 @@ class RayLauncher:
|
|
|
168
168
|
"""
|
|
169
169
|
print("Serializing function and arguments...")
|
|
170
170
|
|
|
171
|
-
#
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
171
|
+
# Check if there is already a func.pkl and args.pkl file
|
|
172
|
+
if os.path.exists(
|
|
173
|
+
os.path.join(self.project_path, "func.pkl")
|
|
174
|
+
) and os.path.exists(os.path.join(self.project_path, "args.pkl")):
|
|
175
|
+
print("Function and arguments already serialized.")
|
|
176
|
+
return
|
|
175
177
|
|
|
176
178
|
# Pickle the function
|
|
177
179
|
with open(os.path.join(self.project_path, "func.pkl"), "wb") as f:
|
|
@@ -299,9 +301,7 @@ class RayLauncher:
|
|
|
299
301
|
|
|
300
302
|
# Wait for log file to be created
|
|
301
303
|
current_queue = None
|
|
302
|
-
queue_log_file = os.path.join(
|
|
303
|
-
self.project_path, "{}_queue.log".format(job_name)
|
|
304
|
-
)
|
|
304
|
+
queue_log_file = os.path.join(self.project_path, "queue.log")
|
|
305
305
|
with open(queue_log_file, "w") as f:
|
|
306
306
|
f.write("")
|
|
307
307
|
print(
|
|
@@ -373,7 +373,11 @@ class RayLauncher:
|
|
|
373
373
|
text += "\n"
|
|
374
374
|
f.write(text)
|
|
375
375
|
|
|
376
|
+
# Print the queue
|
|
377
|
+
print(text)
|
|
378
|
+
|
|
376
379
|
# Wait for the job to finish while printing the log
|
|
380
|
+
print("Job started! Waiting for the job to finish...")
|
|
377
381
|
log_cursor_position = 0
|
|
378
382
|
job_finished = False
|
|
379
383
|
while not job_finished:
|
|
@@ -407,7 +411,7 @@ class RayLauncher:
|
|
|
407
411
|
if self.server_password is None:
|
|
408
412
|
# Add ssh key
|
|
409
413
|
self.server_password = getpass("Enter your cluster password: ")
|
|
410
|
-
|
|
414
|
+
|
|
411
415
|
ssh_client.connect(
|
|
412
416
|
hostname=self.server_ssh,
|
|
413
417
|
username=self.server_username,
|
|
@@ -440,11 +444,12 @@ class RayLauncher:
|
|
|
440
444
|
# lines = [re.sub(r'bitsandbytes\n', 'bitsandbytes --global-option="--cuda_ext"\n', line) for line in lines]
|
|
441
445
|
lines = [re.sub(r"slurmray\n", "", line) for line in lines]
|
|
442
446
|
# Add slurmray --pre
|
|
443
|
-
lines.append("slurmray --pre\n")
|
|
447
|
+
lines.append("slurmray --pre \n")
|
|
444
448
|
# Solve torch buf (https://github.com/pytorch/pytorch/issues/111469)
|
|
445
449
|
if "torchaudio\n" or "torchvision\n" in lines:
|
|
446
|
-
lines.append(
|
|
447
|
-
|
|
450
|
+
lines.append(
|
|
451
|
+
"torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121\n"
|
|
452
|
+
)
|
|
448
453
|
|
|
449
454
|
with open(f"{self.project_path}/requirements.txt", "w") as file:
|
|
450
455
|
file.writelines(lines)
|
|
@@ -484,13 +489,24 @@ class RayLauncher:
|
|
|
484
489
|
break
|
|
485
490
|
print(line, end="")
|
|
486
491
|
|
|
492
|
+
stdout.channel.recv_exit_status()
|
|
493
|
+
|
|
487
494
|
# Downloading result
|
|
488
495
|
print("Downloading result...")
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
496
|
+
try:
|
|
497
|
+
sftp.get(
|
|
498
|
+
"slurmray-server/.slogs/server/result.pkl",
|
|
499
|
+
os.path.join(self.project_path, "result.pkl"),
|
|
500
|
+
)
|
|
501
|
+
print("Result downloaded!")
|
|
502
|
+
except FileNotFoundError:
|
|
503
|
+
# Check for errors
|
|
504
|
+
stderr_lines = stderr.readlines()
|
|
505
|
+
if stderr_lines:
|
|
506
|
+
print("\nErrors:\n")
|
|
507
|
+
for line in stderr_lines:
|
|
508
|
+
print(line, end="")
|
|
509
|
+
print("An error occured, please check the logs.")
|
|
494
510
|
|
|
495
511
|
def __write_server_script(self):
|
|
496
512
|
"""This funtion will write a script with the given specifications to run slurmray on the cluster"""
|
|
@@ -541,20 +557,24 @@ if __name__ == "__main__":
|
|
|
541
557
|
return result
|
|
542
558
|
|
|
543
559
|
launcher = RayLauncher(
|
|
544
|
-
project_name="example",
|
|
545
|
-
func=example_func,
|
|
546
|
-
args={"x": 1},
|
|
547
|
-
files=[
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
560
|
+
project_name="example", # Name of the project (will create a directory with this name in the current directory)
|
|
561
|
+
func=example_func, # Function to execute
|
|
562
|
+
args={"x": 1}, # Arguments of the function
|
|
563
|
+
files=[
|
|
564
|
+
"slurmray/RayLauncher.py"
|
|
565
|
+
], # List of files to push to the cluster (file path will be recreated on the cluster)
|
|
566
|
+
modules=[], # List of modules to load on the curnagl Cluster (CUDA & CUDNN are automatically added if use_gpu=True)
|
|
567
|
+
node_nbr=1, # Number of nodes to use
|
|
568
|
+
use_gpu=False, # If you need A100 GPU, you can set it to True
|
|
569
|
+
memory=8, # In MegaBytes
|
|
570
|
+
max_running_time=5, # In minutes
|
|
571
|
+
runtime_env={
|
|
572
|
+
"env_vars": {"NCCL_SOCKET_IFNAME": "eno1"}
|
|
573
|
+
}, # Example of environment variable
|
|
574
|
+
server_run=True, # To run the code on the cluster and not locally
|
|
575
|
+
server_ssh="curnagl.dcsr.unil.ch", # Address of the SLURM server
|
|
576
|
+
server_username="hjamet", # Username to connect to the server
|
|
577
|
+
server_password=None, # Will be asked in the terminal
|
|
558
578
|
)
|
|
559
579
|
|
|
560
580
|
result = launcher()
|
|
@@ -17,7 +17,34 @@ fi
|
|
|
17
17
|
source .venv/bin/activate
|
|
18
18
|
|
|
19
19
|
# Install requirements
|
|
20
|
-
|
|
20
|
+
## Load all installed packages into a variable
|
|
21
|
+
installed_packages=$(pip3 list --format=freeze)
|
|
22
|
+
## Function to check if a package is installed
|
|
23
|
+
is_package_installed() {
|
|
24
|
+
package=$1
|
|
25
|
+
echo "$installed_packages" | grep -i "^$package==" &> /dev/null
|
|
26
|
+
return $?
|
|
27
|
+
}
|
|
28
|
+
## Read the requirements.txt file line by line
|
|
29
|
+
while IFS= read -r package
|
|
30
|
+
do
|
|
31
|
+
# Check if the line is not empty
|
|
32
|
+
if [ -n "$package" ]; then
|
|
33
|
+
echo "Checking package: $package"
|
|
34
|
+
# Extract the package name without options
|
|
35
|
+
package_name=$(echo "$package" | awk '{print $1}' | cut -d'=' -f1)
|
|
36
|
+
if is_package_installed "$package_name"; then
|
|
37
|
+
echo "The package $package_name is already installed."
|
|
38
|
+
else
|
|
39
|
+
echo "Installing package: $package"
|
|
40
|
+
command="pip3 install $package"
|
|
41
|
+
eval "$command"
|
|
42
|
+
if [ $? -ne 0 ]; then
|
|
43
|
+
echo "Error while installing $package"
|
|
44
|
+
fi
|
|
45
|
+
fi
|
|
46
|
+
fi
|
|
47
|
+
done < "requirements.txt"
|
|
21
48
|
|
|
22
49
|
# Fix torch bug (https://github.com/pytorch/pytorch/issues/111469)
|
|
23
50
|
export LD_LIBRARY_PATH=$HOME/slurmray-server/.venv/lib/python3.9/site-packages/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: slurmray
|
|
3
|
-
Version: 3.5.9
|
|
3
|
+
Version: 3.6.0
|
|
4
4
|
Summary: SlurmRay is a module for effortlessly distributing tasks on a Slurm cluster using the Ray library.
|
|
5
5
|
Home-page: https://henri-jamet.vercel.app/
|
|
6
6
|
License: Apache License
|
|
@@ -23,7 +23,7 @@ Description-Content-Type: text/markdown
|
|
|
23
23
|
|
|
24
24
|
# SLURM_RAY
|
|
25
25
|
|
|
26
|
-
👉[Full documentation](https://henri-jamet.
|
|
26
|
+
👉[Full documentation](https://www.henri-jamet.com/docs/slurmray/slurm-ray/)
|
|
27
27
|
|
|
28
28
|
## Description
|
|
29
29
|
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
slurmray/RayLauncher.py,sha256=3pOOMGDENspcfHiEaxWoyLx6Wep5XHItRBuSXJ3cUvI,23279
|
|
2
|
+
slurmray/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
slurmray/assets/sbatch_template.sh,sha256=c-7J4ItzrctDrbF5Znu8p1d_xIgayC9puhjX3nLMzsk,2273
|
|
4
|
+
slurmray/assets/slurmray_server.sh,sha256=-PpX3AitLVfAYjyNqE3BjtDu5uvk11KoiaCUVgmtcEQ,1506
|
|
5
|
+
slurmray/assets/slurmray_server_template.py,sha256=PF4Rl3TrTS8hI0jbCMlOuRmICkL_OucO6R-uKq83kvg,446
|
|
6
|
+
slurmray/assets/spython_template.py,sha256=kRUvNQs9iCcg0wJLmm9LV0TnbUdlenZMYPr_bZPkXLg,597
|
|
7
|
+
slurmray-3.6.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
8
|
+
slurmray-3.6.0.dist-info/METADATA,sha256=VrDOXUE4gBXscmducVlQ2pqzEl2kV0kRO9pBKO-gWQM,3530
|
|
9
|
+
slurmray-3.6.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
10
|
+
slurmray-3.6.0.dist-info/RECORD,,
|
slurmray-3.5.9.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
slurmray/RayLauncher.py,sha256=6ZS8o4CT2ulwTlmW0ahImNEHZy0h-srgNyJoxGew8lg,22617
|
|
2
|
-
slurmray/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
slurmray/assets/sbatch_template.sh,sha256=c-7J4ItzrctDrbF5Znu8p1d_xIgayC9puhjX3nLMzsk,2273
|
|
4
|
-
slurmray/assets/slurmray_server.sh,sha256=BpmyczNtMlsRimbUYU2XSE59YHGSsozo3rqSQcXNubQ,638
|
|
5
|
-
slurmray/assets/slurmray_server_template.py,sha256=xjuF3nwvQONRxQNzXkAsgFkInY80y6ynkHE9zJjw0xk,575
|
|
6
|
-
slurmray/assets/spython_template.py,sha256=kRUvNQs9iCcg0wJLmm9LV0TnbUdlenZMYPr_bZPkXLg,597
|
|
7
|
-
slurmray-3.5.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
8
|
-
slurmray-3.5.9.dist-info/METADATA,sha256=lbPv9a_Mn5nyXayOfrQka6rZG6QmtRzOVOsyjAxdJO4,3549
|
|
9
|
-
slurmray-3.5.9.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
10
|
-
slurmray-3.5.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|