PyPI - slurmray - Versions diffs - 3.5.9__tar.gz → 3.6.1__tar.gz - Mend

slurmray 3.5.9__tar.gz → 3.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of slurmray might be problematic. Click here for more details.

Files changed (11)

{slurmray-3.5.9 → slurmray-3.6.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: slurmray
-Version: 3.5.9
+Version: 3.6.1
 Summary: SlurmRay is a module for effortlessly distributing tasks on a Slurm cluster using the Ray library.
 Home-page: https://henri-jamet.vercel.app/
 License: Apache License
@@ -23,7 +23,7 @@ Description-Content-Type: text/markdown
 # SLURM_RAY
-👉[Full documentation](https://henri-jamet.vercel.app/cards/documentation/slurm-ray/slurm-ray/)
+👉[Full documentation](https://www.henri-jamet.com/docs/slurmray/slurm-ray/)
 ## Description

{slurmray-3.5.9 → slurmray-3.6.1}/README.md RENAMED Viewed

@@ -1,6 +1,6 @@
 # SLURM_RAY
-👉[Full documentation](https://henri-jamet.vercel.app/cards/documentation/slurm-ray/slurm-ray/)
+👉[Full documentation](https://www.henri-jamet.com/docs/slurmray/slurm-ray/)
 ## Description

{slurmray-3.5.9 → slurmray-3.6.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "slurmray"
-version = "3.5.9"
+version = "3.6.1"
 description = "SlurmRay is a module for effortlessly distributing tasks on a Slurm cluster using the Ray library. "
 authors = ["Henri Jamet <henri.jamet@unil.ch>"]
 license = "Apache License"

{slurmray-3.5.9 → slurmray-3.6.1}/slurmray/RayLauncher.py RENAMED Viewed

@@ -87,17 +87,19 @@ class RayLauncher:
             self.__write_python_script()
             self.script_file, self.job_name = self.__write_slurm_script()
-    def __call__(self, cancel_old_jobs: bool = True) -> Any:
+    def __call__(self, cancel_old_jobs: bool = True, serialize: bool = True) -> Any:
         """Launch the job and return the result
         Args:
             cancel_old_jobs (bool, optional): Cancel the old jobs. Defaults to True.
+            serialize (bool, optional): Serialize the function and the arguments. This should be set to False if the function is automatically called by the server. Defaults to True.
         Returns:
             Any: Result of the function
         """
         # Sereialize function and arguments
-        self.__serialize_func_and_args(self.func, self.args)
+        if serialize:
+            self.__serialize_func_and_args(self.func, self.args)
         if self.cluster:
             print("Cluster detected, running on cluster...")
@@ -168,11 +170,6 @@ class RayLauncher:
         """
         print("Serializing function and arguments...")
-        # Remove the old python script
-        for file in os.listdir(self.project_path):
-            if file.endswith(".pkl"):
-                os.remove(os.path.join(self.project_path, file))
         # Pickle the function
         with open(os.path.join(self.project_path, "func.pkl"), "wb") as f:
             dill.dump(func, f)
@@ -299,9 +296,7 @@ class RayLauncher:
         # Wait for log file to be created
         current_queue = None
-        queue_log_file = os.path.join(
-            self.project_path, "{}_queue.log".format(job_name)
-        )
+        queue_log_file = os.path.join(self.project_path, "queue.log")
         with open(queue_log_file, "w") as f:
             f.write("")
         print(
@@ -312,6 +307,7 @@ class RayLauncher:
         subprocess.Popen(
             ["tail", "-f", os.path.join(self.project_path, "{}.log".format(job_name))]
         )
+        start_time = time.time()
         while True:
             time.sleep(0.25)
             if os.path.exists(
@@ -361,6 +357,12 @@ class RayLauncher:
                         node_list,
                     )
                 )[1:]
+                # Update the queue log
+                if time.time() - start_time > 60:
+                    start_time = time.time()
+                    print("Update time: {}".format(time.strftime("%H:%M:%S")))
                 if current_queue is None or current_queue != to_queue:
                     current_queue = to_queue
                     with open(queue_log_file, "w") as f:
@@ -373,7 +375,11 @@ class RayLauncher:
                         text += "\n"
                         f.write(text)
+                        # Print the queue
+                        print(text)
         # Wait for the job to finish while printing the log
+        print("Job started! Waiting for the job to finish...")
         log_cursor_position = 0
         job_finished = False
         while not job_finished:
@@ -407,7 +413,7 @@ class RayLauncher:
                 if self.server_password is None:
                     # Add ssh key
                     self.server_password = getpass("Enter your cluster password: ")
                 ssh_client.connect(
                     hostname=self.server_ssh,
                     username=self.server_username,
@@ -440,11 +446,12 @@ class RayLauncher:
             # lines = [re.sub(r'bitsandbytes\n', 'bitsandbytes --global-option="--cuda_ext"\n', line) for line in lines]
             lines = [re.sub(r"slurmray\n", "", line) for line in lines]
             # Add slurmray --pre
-            lines.append("slurmray --pre\n")
+            lines.append("slurmray --pre \n")
             # Solve torch buf (https://github.com/pytorch/pytorch/issues/111469)
             if "torchaudio\n" or "torchvision\n" in lines:
-                lines.append("torch==2.1.1\n")
-                lines.append("--index-url https://download.pytorch.org/whl/cu121\n")
+                lines.append(
+                    "torch==2.1.1 --index-url https://download.pytorch.org/whl/cu121\n"
+                )
         with open(f"{self.project_path}/requirements.txt", "w") as file:
             file.writelines(lines)
@@ -484,13 +491,24 @@ class RayLauncher:
                 break
             print(line, end="")
+        stdout.channel.recv_exit_status()
         # Downloading result
         print("Downloading result...")
-        sftp.get(
-            "slurmray-server/.slogs/server/result.pkl",
-            os.path.join(self.project_path, "result.pkl"),
-        )
-        print("Result downloaded!")
+        try:
+            sftp.get(
+                "slurmray-server/.slogs/server/result.pkl",
+                os.path.join(self.project_path, "result.pkl"),
+            )
+            print("Result downloaded!")
+        except FileNotFoundError:
+            # Check for errors
+            stderr_lines = stderr.readlines()
+            if stderr_lines:
+                print("\nErrors:\n")
+                for line in stderr_lines:
+                    print(line, end="")
+                print("An error occured, please check the logs.")
     def __write_server_script(self):
         """This funtion will write a script with the given specifications to run slurmray on the cluster"""
@@ -541,20 +559,24 @@ if __name__ == "__main__":
         return result
     launcher = RayLauncher(
-        project_name="example", # Name of the project (will create a directory with this name in the current directory)
-        func=example_func, # Function to execute
-        args={"x": 1}, # Arguments of the function
-        files=["slurmray/RayLauncher.py"], # List of files to push to the cluster (file path will be recreated on the cluster)
-        modules=[], # List of modules to load on the curnagl Cluster (CUDA & CUDNN are automatically added if use_gpu=True)
-        node_nbr=1, # Number of nodes to use
-        use_gpu=True, # If you need A100 GPU, you can set it to True
-        memory=8, # In MegaBytes
-        max_running_time=5, # In minutes
-        runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "eno1"}}, # Example of environment variable
-        server_run=True, # To run the code on the cluster and not locally
-        server_ssh="curnagl.dcsr.unil.ch", # Address of the SLURM server
-        server_username="hjamet", # Username to connect to the server
-        server_password=None, # Will be asked in the terminal
+        project_name="example",  # Name of the project (will create a directory with this name in the current directory)
+        func=example_func,  # Function to execute
+        args={"x": 5},  # Arguments of the function
+        files=[
+            "slurmray/RayLauncher.py"
+        ],  # List of files to push to the cluster (file path will be recreated on the cluster)
+        modules=[],  # List of modules to load on the curnagl Cluster (CUDA & CUDNN are automatically added if use_gpu=True)
+        node_nbr=1,  # Number of nodes to use
+        use_gpu=True,  # If you need A100 GPU, you can set it to True
+        memory=8,  # In MegaBytes
+        max_running_time=5,  # In minutes
+        runtime_env={
+            "env_vars": {"NCCL_SOCKET_IFNAME": "eno1"}
+        },  # Example of environment variable
+        server_run=True,  # To run the code on the cluster and not locally
+        server_ssh="curnagl.dcsr.unil.ch",  # Address of the SLURM server
+        server_username="hjamet",  # Username to connect to the server
+        server_password=None,  # Will be asked in the terminal
     )
     result = launcher()

slurmray-3.6.1/slurmray/assets/slurmray_server.sh ADDED Viewed

@@ -0,0 +1,54 @@
+#!/bin/sh
+echo "Installing slurmray server"
+# Copy files
+mv -t slurmray-server requirements.txt slurmray_server.py
+mv -t slurmray-server/.slogs/server func.pkl args.pkl
+cd slurmray-server
+# Load modules
+module load gcc python/3.9.13 cuda cudnn
+# Check if venv exists
+if [ ! -d ".venv" ]; then
+    python3 -m venv .venv
+fi
+source .venv/bin/activate
+# Install requirements
+## Load all installed packages into a variable
+installed_packages=$(pip3 list --format=freeze)
+## Function to check if a package is installed
+is_package_installed() {
+  package=$1
+  echo "$installed_packages" | grep -i "^$package==" &> /dev/null
+  return $?
+}
+## Read the requirements.txt file line by line
+while IFS= read -r package
+do
+  # Check if the line is not empty
+  if [ -n "$package" ]; then
+    echo "Checking package: $package"
+    # Extract the package name without options
+    package_name=$(echo "$package" | awk '{print $1}' | cut -d'=' -f1)
+    if is_package_installed "$package_name"; then
+      echo "The package $package_name is already installed."
+    else
+      echo "Installing package: $package"
+      command="pip3 install $package"
+      eval "$command"
+      if [ $? -ne 0 ]; then
+        echo "Error while installing $package"
+      fi
+    fi
+  fi
+done < "requirements.txt"
+# Fix torch bug (https://github.com/pytorch/pytorch/issues/111469)
+export LD_LIBRARY_PATH=$HOME/slurmray-server/.venv/lib/python3.9/site-packages/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH
+# Run server
+python -u slurmray_server.py

{slurmray-3.5.9 → slurmray-3.6.1}/slurmray/assets/slurmray_server_template.py RENAMED Viewed

@@ -15,8 +15,5 @@ if __name__ == "__main__":
         server_ssh=None,
         server_username=None,
     )
-    # Remove serialization
-    launcher.__serialize_func_and_args = lambda *args, **kwargs : print("No serialization done.")
-    result = launcher()
+    result = launcher(serialize=False)

slurmray-3.5.9/slurmray/assets/slurmray_server.sh DELETED Viewed

@@ -1,27 +0,0 @@
-#!/bin/sh
-echo "Installing slurmray server"
-# Copy files
-mv -t slurmray-server requirements.txt slurmray_server.py
-mv -t slurmray-server/.slogs/server func.pkl args.pkl
-cd slurmray-server
-# Load modules
-module load gcc python/3.9.13 cuda cudnn
-# Check if venv exists
-if [ ! -d ".venv" ]; then
-    python3 -m venv .venv
-fi
-source .venv/bin/activate
-# Install requirements
-pip3 install -r requirements.txt
-# Fix torch bug (https://github.com/pytorch/pytorch/issues/111469)
-export LD_LIBRARY_PATH=$HOME/slurmray-server/.venv/lib/python3.9/site-packages/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH
-# Run server
-python -u slurmray_server.py

{slurmray-3.5.9 → slurmray-3.6.1}/LICENSE RENAMED Viewed

File without changes

{slurmray-3.5.9 → slurmray-3.6.1}/slurmray/__init__.py RENAMED Viewed

File without changes

{slurmray-3.5.9 → slurmray-3.6.1}/slurmray/assets/sbatch_template.sh RENAMED Viewed

File without changes

{slurmray-3.5.9 → slurmray-3.6.1}/slurmray/assets/spython_template.py RENAMED Viewed

File without changes

slurmray 3.5.9__tar.gz → 3.6.1__tar.gz

Potentially problematic release.

slurmray 3.5.9__tar.gz → 3.6.1__tar.gz