slurmray 6.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of slurmray has been flagged as potentially problematic; review the release details below before depending on it.

@@ -0,0 +1,856 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import re
5
+ import paramiko
6
+ import subprocess
7
+ import dill
8
+ from typing import Any
9
+
10
+ from slurmray.backend.remote import RemoteMixin
11
+ from slurmray.utils import SSHTunnel, DependencyManager
12
+
13
+
14
class DesiBackend(RemoteMixin):
    """Backend for Desi server (ISIPOL09) execution"""

    # Constants for Desi environment.
    # NOTE(review): neither constant is referenced by run(), which builds its
    # own "/home/{username}/slurmray-server/{project}" path and relies on the
    # venv's python — confirm which values are authoritative on Desi.
    SERVER_BASE_DIR = "/home/users/{username}/slurmray-server"  # Need to check where to write on Desi. assuming home.
    PYTHON_CMD = "/usr/bin/python3"  # To be verified

    def __init__(self, launcher):
        """Initialize the backend.

        Args:
            launcher: owning launcher object carrying connection settings
                (server_ssh, server_username, server_password) and project
                metadata (project_name, project_path, files, ...).
        """
        super().__init__(launcher)
        # SSH tunnel toward the remote Ray dashboard; created lazily in run().
        self.tunnel = None
24
+
25
+ def run(self, cancel_old_jobs: bool = True, wait: bool = True) -> Any:
26
+ """Run the job on Desi"""
27
+ self.logger.info("🔌 Connecting to Desi server...")
28
+ self._connect()
29
+ self.logger.info("✅ Connected successfully")
30
+
31
+ # Setup pyenv Python version if available
32
+ self.pyenv_python_cmd = None
33
+ if hasattr(self.launcher, "local_python_version"):
34
+ self.pyenv_python_cmd = self._setup_pyenv_python(
35
+ self.ssh_client, self.launcher.local_python_version
36
+ )
37
+
38
+ # Check Python version compatibility (with pyenv if available)
39
+ is_compatible = self._check_python_version_compatibility(
40
+ self.ssh_client, self.pyenv_python_cmd
41
+ )
42
+ self.python_version_compatible = is_compatible
43
+
44
+ sftp = self.ssh_client.open_sftp()
45
+
46
+ # Base directory on server (organized by project name)
47
+ base_dir = f"/home/{self.launcher.server_username}/slurmray-server/{self.launcher.project_name}"
48
+
49
+ # Generate requirements first to check venv hash
50
+ self._generate_requirements()
51
+
52
+ # Add slurmray (unpinned for now to match legacy behavior, but could be pinned)
53
+ # Check if slurmray is already in requirements.txt to avoid duplicates
54
+ req_file = f"{self.launcher.project_path}/requirements.txt"
55
+ with open(req_file, "r") as f:
56
+ content = f.read()
57
+ if "slurmray" not in content.lower():
58
+ with open(req_file, "a") as f:
59
+ f.write("slurmray\n")
60
+
61
+ # Check if venv can be reused based on requirements hash
62
+ dep_manager = DependencyManager(self.launcher.project_path, self.logger)
63
+ req_file = os.path.join(self.launcher.project_path, "requirements.txt")
64
+
65
+ should_recreate_venv = True
66
+ if self.launcher.force_reinstall_venv:
67
+ # Force recreation: remove venv if it exists
68
+ self.logger.info("🔄 Recreating virtual environment...")
69
+ self.ssh_client.exec_command(f"rm -rf {base_dir}/venv")
70
+ should_recreate_venv = True
71
+ elif os.path.exists(req_file):
72
+ with open(req_file, "r") as f:
73
+ req_lines = f.readlines()
74
+ # Check remote hash (if venv exists on remote)
75
+ remote_hash_file = f"{base_dir}/.slogs/venv_hash.txt"
76
+ stdin, stdout, stderr = self.ssh_client.exec_command(
77
+ f"test -f {remote_hash_file} && cat {remote_hash_file} || echo ''"
78
+ )
79
+ remote_hash = stdout.read().decode("utf-8").strip()
80
+ current_hash = dep_manager.compute_requirements_hash(req_lines)
81
+
82
+ if remote_hash and remote_hash == current_hash:
83
+ # Hash matches, check if venv exists
84
+ stdin, stdout, stderr = self.ssh_client.exec_command(
85
+ f"test -d {base_dir}/venv && echo exists || echo missing"
86
+ )
87
+ venv_exists = stdout.read().decode("utf-8").strip() == "exists"
88
+ if venv_exists:
89
+ should_recreate_venv = False
90
+
91
+ # Smart cleanup: preserve venv if hash matches
92
+ if should_recreate_venv:
93
+ # Clean up everything including venv
94
+ self.ssh_client.exec_command(f"mkdir -p {base_dir} && rm -rf {base_dir}/*")
95
+ # Create flag file to force venv recreation in script
96
+ if self.launcher.force_reinstall_venv:
97
+ self.ssh_client.exec_command(f"touch {base_dir}/.force_reinstall")
98
+ else:
99
+ # Clean up everything except venv and cache
100
+ self.ssh_client.exec_command(
101
+ f"mkdir -p {base_dir} && find {base_dir} -mindepth 1 ! -name 'venv' ! -path '{base_dir}/venv/*' ! -name '.slogs' ! -path '{base_dir}/.slogs/*' -delete"
102
+ )
103
+ # Remove flag file if it exists
104
+ self.ssh_client.exec_command(f"rm -f {base_dir}/.force_reinstall")
105
+
106
+ # Generate Python script (spython.py) that will run on Desi
107
+ # This script uses RayLauncher in LOCAL mode (but on the remote machine)
108
+ # We need to adapt spython.py generation to NOT look for sbatch/slurm
109
+ self._write_python_script(base_dir)
110
+
111
+ # Optimize requirements
112
+ venv_cmd = (
113
+ f"source {base_dir}/venv/bin/activate &&"
114
+ if not should_recreate_venv
115
+ else ""
116
+ )
117
+ req_file_to_push = self._optimize_requirements(self.ssh_client, venv_cmd)
118
+
119
+ # Push files
120
+ files_to_push = [
121
+ f
122
+ for f in os.listdir(self.launcher.project_path)
123
+ if (f.endswith(".py") or f.endswith(".pkl") or f.endswith(".txt"))
124
+ and f != "requirements.txt"
125
+ ]
126
+
127
+ # Fail-fast: ensure func_name.txt is present and will be uploaded
128
+ func_name_txt = "func_name.txt"
129
+ func_name_path = os.path.join(self.launcher.project_path, func_name_txt)
130
+ if not os.path.exists(func_name_path):
131
+ error_msg = f"❌ ERROR: func_name.txt not found locally at {func_name_path}. Cannot proceed without function name."
132
+ self.logger.error(error_msg)
133
+ raise FileNotFoundError(error_msg)
134
+
135
+ if func_name_txt not in files_to_push:
136
+ files_to_push.append(func_name_txt)
137
+
138
+ if files_to_push:
139
+ self.logger.info(f"📤 Uploading {len(files_to_push)} file(s) to server...")
140
+ for file in files_to_push:
141
+ sftp.put(
142
+ os.path.join(self.launcher.project_path, file), f"{base_dir}/{file}"
143
+ )
144
+
145
+ # Verify func_name.txt was uploaded successfully (fail-fast)
146
+ stdin, stdout, stderr = self.ssh_client.exec_command(
147
+ f"test -f {base_dir}/{func_name_txt} && echo 'exists' || echo 'missing'"
148
+ )
149
+ stdout.channel.recv_exit_status()
150
+ if "missing" in stdout.read().decode("utf-8").strip():
151
+ error_msg = f"❌ ERROR: func_name.txt upload failed. File not found on server at {base_dir}/{func_name_txt}"
152
+ self.logger.error(error_msg)
153
+ raise FileNotFoundError(error_msg)
154
+
155
+ # Push optimized requirements as requirements.txt
156
+ if os.path.exists(req_file_to_push):
157
+ sftp.put(req_file_to_push, f"{base_dir}/requirements.txt")
158
+ else:
159
+ # Ensure no stale requirements.txt exists on remote if local one is missing
160
+ try:
161
+ sftp.remove(f"{base_dir}/requirements.txt")
162
+ except IOError:
163
+ pass # File didn't exist
164
+
165
+ # Store venv hash on remote for future checks
166
+ if os.path.exists(req_file):
167
+ with open(req_file, "r") as f:
168
+ req_lines = f.readlines()
169
+ current_hash = dep_manager.compute_requirements_hash(req_lines)
170
+ # Ensure .slogs directory exists on remote
171
+ self.ssh_client.exec_command(f"mkdir -p {base_dir}/.slogs")
172
+ stdin, stdout, stderr = self.ssh_client.exec_command(
173
+ f"echo '{current_hash}' > {base_dir}/.slogs/venv_hash.txt"
174
+ )
175
+ stdout.channel.recv_exit_status()
176
+ # Also store locally
177
+ dep_manager.store_venv_hash(current_hash)
178
+
179
+ # Update retention timestamp
180
+ self._update_retention_timestamp(
181
+ self.ssh_client, base_dir, self.launcher.retention_days
182
+ )
183
+
184
+ # Filter valid files
185
+ valid_files = []
186
+ for file in self.launcher.files:
187
+ # Skip invalid paths
188
+ if (
189
+ not file
190
+ or file == "."
191
+ or file == ".."
192
+ or file.startswith("./")
193
+ or file.startswith("../")
194
+ ):
195
+ self.logger.warning(f"Skipping invalid file path: {file}")
196
+ continue
197
+ valid_files.append(file)
198
+
199
+ # Use incremental sync for local files
200
+ if valid_files:
201
+ self._sync_local_files_incremental(sftp, base_dir, valid_files)
202
+
203
+ # Create runner script (shell script to setup env and run python)
204
+ runner_script = "run_desi.sh"
205
+ self._write_runner_script(runner_script, base_dir)
206
+ sftp.put(
207
+ os.path.join(self.launcher.project_path, runner_script),
208
+ f"{base_dir}/{runner_script}",
209
+ )
210
+ self.ssh_client.exec_command(f"chmod +x {base_dir}/{runner_script}")
211
+
212
+ # Run the script
213
+ self.logger.info("🚀 Starting job execution...")
214
+
215
+ desi_wrapper_script = "desi_wrapper.py"
216
+ self._write_desi_wrapper(desi_wrapper_script)
217
+ sftp.put(
218
+ os.path.join(self.launcher.project_path, desi_wrapper_script),
219
+ f"{base_dir}/{desi_wrapper_script}",
220
+ )
221
+
222
+ # Determine command based on wait mode
223
+ if wait:
224
+ cmd = f"cd {base_dir} && ./run_desi.sh"
225
+ stdin, stdout, stderr = self.ssh_client.exec_command(cmd, get_pty=True)
226
+
227
+ # Stream output
228
+ ray_started = False
229
+
230
+ # Read output line by line
231
+ while True:
232
+ line = stdout.readline()
233
+ if not line:
234
+ break
235
+
236
+ # Filter out noisy messages and format nicely
237
+ line_stripped = line.strip()
238
+ if not line_stripped:
239
+ continue
240
+
241
+ # Skip pkill errors (already handled silently)
242
+ if "pkill:" in line_stripped:
243
+ continue
244
+
245
+ # Detect Ray startup
246
+ if (
247
+ "Started a local Ray instance" in line_stripped
248
+ or "View the dashboard at" in line_stripped
249
+ ) and not ray_started:
250
+ ray_started = True
251
+ # Extract dashboard URL if present
252
+ if "http://" in line_stripped:
253
+ # Extract URL from line
254
+ import re
255
+
256
+ url_match = re.search(r"http://[^\s]+", line_stripped)
257
+ if url_match:
258
+ dashboard_url = url_match.group(0)
259
+ self.logger.info(f"📊 Ray dashboard started at {dashboard_url}")
260
+
261
+ # Start SSH Tunnel
262
+ if not self.tunnel:
263
+ try:
264
+ self.tunnel = SSHTunnel(
265
+ ssh_host=self.launcher.server_ssh,
266
+ ssh_username=self.launcher.server_username,
267
+ ssh_password=self.launcher.server_password,
268
+ remote_host="127.0.0.1",
269
+ local_port=8888,
270
+ remote_port=8265,
271
+ logger=self.logger,
272
+ )
273
+ self.tunnel.__enter__()
274
+ self.logger.info(
275
+ "🌐 Dashboard accessible locally at http://localhost:8888"
276
+ )
277
+ except Exception as e:
278
+ self.logger.warning(f"⚠️ Could not establish SSH tunnel: {e}")
279
+ self.tunnel = None
280
+ continue
281
+
282
+ # Print all output (user's print statements and important messages)
283
+ # Filter out only very noisy system messages
284
+ if not any(noise in line_stripped for noise in ["pkill:", "WARNING:"]):
285
+ # Always print user output
286
+ print(line, end="", flush=True)
287
+
288
+ # Log important system messages with emojis
289
+ if (
290
+ "Error" in line_stripped
291
+ or "Traceback" in line_stripped
292
+ or "Exception" in line_stripped
293
+ ):
294
+ self.logger.error(f"❌ {line_stripped}")
295
+ elif "Lock acquired" in line_stripped:
296
+ self.logger.info(f"🔒 {line_stripped}")
297
+ elif "Starting Payload" in line_stripped:
298
+ self.logger.info(f"🚀 {line_stripped}")
299
+ elif "Loaded function" in line_stripped:
300
+ self.logger.info(f"📦 {line_stripped}")
301
+ elif "Job started" in line_stripped or "Sleeping" in line_stripped:
302
+ self.logger.info(f"▶️ {line_stripped}")
303
+ elif "Result written" in line_stripped:
304
+ self.logger.info(f"💾 {line_stripped}")
305
+ elif (
306
+ "Releasing lock" in line_stripped
307
+ or "Lock released" in line_stripped
308
+ ):
309
+ self.logger.info(f"🔓 {line_stripped}")
310
+
311
+ # Read any remaining stderr
312
+ stderr_output = stderr.read().decode("utf-8")
313
+ if stderr_output.strip():
314
+ self.logger.error(f"Script errors:\n{stderr_output}")
315
+ print(stderr_output, end="")
316
+
317
+ exit_status = stdout.channel.recv_exit_status()
318
+
319
+ # Check if script failed - fail-fast immediately
320
+ if exit_status != 0:
321
+ # Collect error information
322
+ error_msg = f"Job script exited with non-zero status: {exit_status}"
323
+ if stderr_output.strip():
324
+ error_msg += f"\nScript errors:\n{stderr_output}"
325
+
326
+ # Log the error
327
+ self.logger.error(error_msg)
328
+
329
+ # Close tunnel if open
330
+ if self.tunnel:
331
+ try:
332
+ self.tunnel.__exit__(None, None, None)
333
+ except Exception:
334
+ pass
335
+ self.tunnel = None
336
+
337
+ # Raise exception immediately (fail-fast)
338
+ raise RuntimeError(error_msg)
339
+
340
+ # Wait a bit for file system to sync
341
+ # Keep tunnel open during job execution - it will be closed at the end of run()
342
+ time.sleep(2)
343
+
344
+ # Wait for result file to be created on remote (with timeout)
345
+ self.logger.info("⏳ Waiting for job completion...")
346
+ max_wait = 300 # 5 minutes max
347
+ wait_start = time.time()
348
+ result_available = False
349
+
350
+ while time.time() - wait_start < max_wait:
351
+ try:
352
+ # Check if result.pkl exists on remote
353
+ stdin, stdout, stderr = self.ssh_client.exec_command(
354
+ f"test -f {base_dir}/result.pkl && echo exists || echo missing"
355
+ )
356
+ stdout.channel.recv_exit_status() # Wait for command to complete
357
+ output = stdout.read().decode("utf-8").strip()
358
+ if output == "exists":
359
+ result_available = True
360
+ break
361
+ except Exception as e:
362
+ self.logger.debug(f"Error checking for result file: {e}")
363
+
364
+ time.sleep(5)
365
+
366
+ if not result_available:
367
+ self.logger.error("❌ Timeout waiting for result file.")
368
+ raise TimeoutError("Timeout waiting for result file on Desi.")
369
+
370
+ # Download result
371
+ self.logger.info("📥 Downloading result...")
372
+ local_result_path = os.path.join(self.launcher.project_path, "result.pkl")
373
+ sftp.get(f"{base_dir}/result.pkl", local_result_path)
374
+
375
+ # Load result
376
+ with open(local_result_path, "rb") as f:
377
+ result = dill.load(f)
378
+
379
+ self.logger.info("✅ Result received!")
380
+
381
+ # Close tunnel now that job is complete
382
+ if self.tunnel:
383
+ self.tunnel.__exit__(None, None, None)
384
+ self.tunnel = None
385
+
386
+ else:
387
+ # Async mode: Run with nohup and redirect to log file
388
+ log_file = "desi.log"
389
+ cmd = f"cd {base_dir} && nohup ./run_desi.sh > {log_file} 2>&1 & echo $!"
390
+ stdin, stdout, stderr = self.ssh_client.exec_command(cmd)
391
+ pid = stdout.read().decode("utf-8").strip()
392
+ self.logger.info(f"Async mode: Job started with PID {pid}. Log file: {base_dir}/{log_file}")
393
+
394
+ return pid
395
+
396
+ def _cleanup_local_temp_files(self):
397
+ """Clean up local temporary files after successful execution"""
398
+ temp_files = [
399
+ "func_source.py",
400
+ "func_name.txt",
401
+ "func.pkl",
402
+ "args.pkl",
403
+ "result.pkl",
404
+ "spython.py",
405
+ "run_desi.sh",
406
+ "desi_wrapper.py",
407
+ "requirements_to_install.txt",
408
+ ]
409
+
410
+ for temp_file in temp_files:
411
+ file_path = os.path.join(self.launcher.project_path, temp_file)
412
+ if os.path.exists(file_path):
413
+ os.remove(file_path)
414
+ self.logger.debug(f"Removed temporary file: {temp_file}")
415
+
416
    def cancel(self, job_id: str):
        """Cancel job on Desi.

        Args:
            job_id: remote PID string captured from the nohup launch in
                run(wait=False).

        Best-effort: failures are logged as warnings, never raised.
        """
        self.logger.info(f"Canceling Desi job {job_id}...")
        self._connect()
        try:
            # Try to kill the process and process group.
            # job_id is PID from nohup.
            # Use negative PID to kill process group (signals every child too).
            self.ssh_client.exec_command(f"kill -TERM -{job_id}")
            self.logger.info(f"Sent kill signal to process group {job_id}")

            # Also kill the specific PID just in case the group signal missed it.
            self.ssh_client.exec_command(f"kill -9 {job_id}")
        except Exception as e:
            self.logger.warning(f"Failed to cancel Desi job: {e}")
431
+
432
+ def get_result(self, job_id: str) -> Any:
433
+ """Get result for Desi execution"""
434
+ self._connect()
435
+ base_dir = f"/home/{self.launcher.server_username}/slurmray_desi/{self.launcher.project_name}"
436
+ local_path = os.path.join(self.launcher.project_path, "result.pkl")
437
+
438
+ try:
439
+ sftp = self.ssh_client.open_sftp()
440
+ sftp.stat(f"{base_dir}/result.pkl")
441
+ sftp.get(f"{base_dir}/result.pkl", local_path)
442
+ with open(local_path, "rb") as f:
443
+ return dill.load(f)
444
+ except Exception:
445
+ return None
446
+
447
+ def get_logs(self, job_id: str) -> Any:
448
+ """Get logs for Desi execution"""
449
+ self._connect()
450
+ base_dir = f"/home/{self.launcher.server_username}/slurmray_desi/{self.launcher.project_name}"
451
+ log_file = "desi.log" # Assumed from async execution
452
+ remote_log = f"{base_dir}/{log_file}"
453
+
454
+ try:
455
+ stdin, stdout, stderr = self.ssh_client.exec_command(f"cat {remote_log}")
456
+ for line in stdout:
457
+ yield line
458
+ except Exception as e:
459
+ yield f"Error reading remote log: {e}"
460
+
461
    def _write_python_script(self, base_dir):
        """Write the python script (spython.py) that will be executed by the job.

        Fills the spython_template.py asset: substitutes {{PROJECT_PATH}} with
        the remote base directory and {{LOCAL_MODE}} with ray.init() keyword
        arguments (dashboard enabled on port 8265 plus the runtime_env).

        Args:
            base_dir: absolute project directory on the remote server.
        """
        self.logger.info("Writing python script...")

        # Remove the old python script
        for file in os.listdir(self.launcher.project_path):
            if file.endswith(".py") and "spython" in file:
                os.remove(os.path.join(self.launcher.project_path, file))

        # Write the python script from the packaged template
        with open(
            os.path.join(self.launcher.module_path, "assets", "spython_template.py"),
            "r",
        ) as f:
            text = f.read()

        text = text.replace(
            "{{PROJECT_PATH}}", f'"{base_dir}"'
        )  # On remote, we use absolute path

        # Desi is a single machine (or we treat it as such for now).
        # Ray should run in local mode but without Slurm specifics — it's
        # basically local execution on a remote machine. Previous Ray
        # instances are cleaned up by the runner script before start.
        # Add Ray warning suppression to runtime_env if not already present.
        runtime_env = self.launcher.runtime_env.copy()
        if "env_vars" not in runtime_env:
            runtime_env["env_vars"] = {}
        if "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO" not in runtime_env["env_vars"]:
            runtime_env["env_vars"]["RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO"] = "0"

        # Keyword arguments injected inside the generated ray.init(...) call;
        # the uneven indentation is harmless there (inside parentheses).
        local_mode = f"\n\tinclude_dashboard=True,\n\tdashboard_host='0.0.0.0',\n\tdashboard_port=8265,\nruntime_env = {runtime_env},\n"

        text = text.replace(
            "{{LOCAL_MODE}}",
            local_mode,
        )
        with open(os.path.join(self.launcher.project_path, "spython.py"), "w") as f:
            f.write(text)
502
+
503
+ def _write_runner_script(self, filename, base_dir):
504
+ """Write bash script to set up env and run wrapper"""
505
+ # Determine Python command
506
+ if self.pyenv_python_cmd:
507
+ # Use pyenv: the command already includes eval and pyenv shell
508
+ python_cmd = self.pyenv_python_cmd.split(" && ")[
509
+ -1
510
+ ] # Extract just "python" from the command
511
+ python3_cmd = python_cmd.replace("python", "python3")
512
+ pyenv_setup = self.pyenv_python_cmd.rsplit(" && ", 1)[
513
+ 0
514
+ ] # Get "eval ... && pyenv shell X.Y.Z"
515
+ use_pyenv = True
516
+ else:
517
+ # Fallback to system Python
518
+ python_cmd = "python"
519
+ python3_cmd = "python3"
520
+ pyenv_setup = ""
521
+ use_pyenv = False
522
+
523
+ content = f"""#!/bin/bash
524
+ # Desi Runner Script
525
+ set -e # Exit immediately if a command exits with a non-zero status
526
+
527
+ # Clean up any previous Ray instances (silently)
528
+ pkill -f ray 2>/dev/null || true
529
+
530
+ # Setup pyenv if available
531
+ """
532
+
533
+ if use_pyenv:
534
+ content += f"""# Using pyenv for Python version management
535
+ export PATH="$HOME/.pyenv/bin:/usr/local/bin:/opt/pyenv/bin:$PATH"
536
+ {pyenv_setup}
537
+ """
538
+ else:
539
+ content += """# pyenv not available, using system Python
540
+ """
541
+
542
+ content += f"""
543
+ # Check for force reinstall flag
544
+ if [ -f ".force_reinstall" ]; then
545
+ echo "🔄 Force reinstall detected: removing existing virtualenv..."
546
+ rm -rf venv
547
+ rm -f .force_reinstall
548
+ fi
549
+
550
+ # Create venv if it doesn't exist
551
+ if [ ! -d "venv" ]; then
552
+ echo "📦 Creating virtual environment..."
553
+ """
554
+
555
+ if use_pyenv:
556
+ content += f""" {pyenv_setup} && {python3_cmd} -m venv venv
557
+ """
558
+ else:
559
+ content += f""" {python3_cmd} -m venv venv
560
+ """
561
+
562
+ content += f"""else
563
+ echo "✅ Using existing virtual environment"
564
+ VENV_EXISTED=true
565
+ fi
566
+
567
+ # Activate venv
568
+ source venv/bin/activate
569
+
570
+ # Install dependencies if requirements file exists and is not empty
571
+ if [ -f requirements.txt ]; then
572
+ # Check if requirements.txt is empty (only whitespace)
573
+ if [ -s requirements.txt ]; then
574
+ echo "📥 Installing dependencies from requirements.txt..."
575
+
576
+ # Get installed packages once (fast, single command) - create lookup file
577
+ uv pip list --format=freeze 2>/dev/null | sed 's/==/ /' | awk '{{print $1" "$2}}' > /tmp/installed_packages.txt || touch /tmp/installed_packages.txt
578
+
579
+ # Process requirements: filter duplicates and check what needs installation
580
+ INSTALL_ERRORS=0
581
+ SKIPPED_COUNT=0
582
+ > /tmp/to_install.txt # Clear file
583
+
584
+ while IFS= read -r line || [ -n "$line" ]; do
585
+ # Skip empty lines and comments
586
+ line=$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
587
+ if [ -z "$line" ] || [ "${{line#"#"}}" != "$line" ]; then
588
+ continue
589
+ fi
590
+
591
+ # Extract package name (remove version specifiers and extras)
592
+ pkg_name=$(echo "$line" | sed 's/[<>=!].*//' | sed 's/\\[.*\\]//' | sed 's/[[:space:]]*//' | tr '[:upper:]' '[:lower:]')
593
+ if [ -z "$pkg_name" ]; then
594
+ continue
595
+ fi
596
+
597
+ # Skip duplicates (check if we've already processed this package)
598
+ if grep -qi "^$pkg_name$" /tmp/seen_packages.txt 2>/dev/null; then
599
+ continue
600
+ fi
601
+ echo "$pkg_name" >> /tmp/seen_packages.txt
602
+
603
+ # Extract required version if present
604
+ required_version=""
605
+ if echo "$line" | grep -q "=="; then
606
+ required_version=$(echo "$line" | sed 's/.*==\\([^;]*\\).*/\\1/' | sed 's/[[:space:]]*//')
607
+ fi
608
+
609
+ # Check if package is already installed with correct version
610
+ installed_version=$(grep -i "^$pkg_name " /tmp/installed_packages.txt 2>/dev/null | awk '{{print $2}}' | head -1)
611
+
612
+ if [ -n "$installed_version" ]; then
613
+ if [ -z "$required_version" ] || [ "$installed_version" = "$required_version" ]; then
614
+ echo " ⏭️ $pkg_name==$installed_version (already installed)"
615
+ SKIPPED_COUNT=$((SKIPPED_COUNT + 1))
616
+ continue
617
+ fi
618
+ fi
619
+
620
+ # Package not installed or version mismatch, add to install list
621
+ echo "$line" >> /tmp/to_install.txt
622
+ done < requirements.txt
623
+
624
+ # Install packages that need installation
625
+ if [ -s /tmp/to_install.txt ]; then
626
+ > /tmp/install_errors.txt # Track errors
627
+ while IFS= read -r line; do
628
+ pkg_name=$(echo "$line" | sed 's/[<>=!].*//' | sed 's/\\[.*\\]//' | sed 's/[[:space:]]*//')
629
+ if uv pip install --quiet "$line" >/dev/null 2>&1; then
630
+ echo " ✅ $pkg_name"
631
+ else
632
+ echo " ❌ $pkg_name"
633
+ echo "1" >> /tmp/install_errors.txt
634
+ # Show error details
635
+ uv pip install "$line" 2>&1 | grep -E "(error|Error|ERROR|failed|Failed|FAILED)" | head -3 | sed 's/^/ /' || true
636
+ fi
637
+ done < /tmp/to_install.txt
638
+ INSTALL_ERRORS=$(wc -l < /tmp/install_errors.txt 2>/dev/null | tr -d ' ' || echo "0")
639
+ rm -f /tmp/install_errors.txt
640
+ fi
641
+
642
+ # Count newly installed packages before cleanup
643
+ NEWLY_INSTALLED=0
644
+ if [ -s /tmp/to_install.txt ]; then
645
+ NEWLY_INSTALLED=$(wc -l < /tmp/to_install.txt 2>/dev/null | tr -d ' ' || echo "0")
646
+ fi
647
+
648
+ # Cleanup temp files
649
+ rm -f /tmp/installed_packages.txt /tmp/seen_packages.txt /tmp/to_install.txt
650
+
651
+ if [ $INSTALL_ERRORS -eq 0 ]; then
652
+ if [ $SKIPPED_COUNT -gt 0 ]; then
653
+ echo "✅ All dependencies up to date ($SKIPPED_COUNT already installed, $NEWLY_INSTALLED newly installed)"
654
+ else
655
+ echo "✅ All dependencies installed successfully"
656
+ fi
657
+ else
658
+ echo "❌ Failed to install $INSTALL_ERRORS package(s)" >&2
659
+ exit 1
660
+ fi
661
+ else
662
+ if [ "$VENV_EXISTED" = "true" ]; then
663
+ echo "✅ All dependencies already installed (requirements.txt is empty)"
664
+ else
665
+ echo "⚠️ requirements.txt is empty, skipping dependency installation"
666
+ fi
667
+ fi
668
+ else
669
+ echo "⚠️ No requirements.txt found, skipping dependency installation"
670
+ fi
671
+
672
+ # Add current directory to PYTHONPATH to make 'slurmray' importable
673
+ export PYTHONPATH=$PYTHONPATH:.
674
+
675
+ # Run wrapper (Smart Lock + Script execution)
676
+ echo "🔒 Acquiring Smart Lock and starting job..."
677
+ # Use venv Python (venv is already activated above)
678
+ """
679
+
680
+ # After venv activation, use the venv's python, not the system/pyenv python
681
+ content += """python desi_wrapper.py
682
+ """
683
+
684
+ with open(os.path.join(self.launcher.project_path, filename), "w") as f:
685
+ f.write(content)
686
+
687
    def _write_desi_wrapper(self, filename):
        """Write python wrapper for Smart Lock with queue management.

        Generates a standalone script that serializes jobs on the single Desi
        machine via an fcntl advisory lock on /tmp/slurmray_desi.lock, tracks
        waiting/running jobs in a JSON queue file, and finally runs spython.py
        under the project venv's interpreter.

        Args:
            filename: name of the wrapper file to write into the local
                project directory (uploaded to the server by run()).
        """
        # NOTE: the body below is a runtime payload; doubled braces render as
        # single braces in the generated file.
        content = f"""
import os
import sys
import time
import fcntl
import subprocess
import json

LOCK_FILE = "/tmp/slurmray_desi.lock"
QUEUE_FILE = "/tmp/slurmray_desi.queue"
MAX_RETRIES = 1000
RETRY_DELAY = 30 # seconds

def read_queue():
    '''Read queue file (read-only, no lock needed)'''
    if not os.path.exists(QUEUE_FILE):
        return []
    try:
        with open(QUEUE_FILE, 'r') as f:
            return json.load(f)
    except (json.JSONDecodeError, IOError):
        return []

def write_queue(queue_data):
    '''Write queue file with exclusive lock'''
    max_retries = 10
    retry_delay = 0.1
    for attempt in range(max_retries):
        try:
            queue_fd = open(QUEUE_FILE, 'w')
            try:
                fcntl.lockf(queue_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                json.dump(queue_data, queue_fd, indent=2)
                queue_fd.flush()
                os.fsync(queue_fd.fileno())
                fcntl.lockf(queue_fd, fcntl.LOCK_UN)
                queue_fd.close()
                return True
            except IOError:
                queue_fd.close()
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                    continue
                return False
        except IOError:
            if attempt < max_retries - 1:
                time.sleep(retry_delay)
                continue
            return False
    return False

def add_to_queue(pid, user, func_name, project_dir, status):
    '''Add or update entry in queue'''
    queue = read_queue()
    # Remove existing entry with same PID if present
    queue = [entry for entry in queue if entry.get("pid") != str(pid)]
    # Add new entry
    entry = {{
        "pid": str(pid),
        "user": user,
        "func_name": func_name,
        "status": status,
        "timestamp": int(time.time()),
        "project_dir": project_dir
    }}
    queue.append(entry)
    write_queue(queue)

def remove_from_queue(pid):
    '''Remove entry from queue'''
    queue = read_queue()
    queue = [entry for entry in queue if entry.get("pid") != str(pid)]
    write_queue(queue)

def update_queue_status(pid, status):
    '''Update status of an entry in queue'''
    queue = read_queue()
    for entry in queue:
        if entry.get("pid") == str(pid):
            entry["status"] = status
            write_queue(queue)
            return True
    return False

def acquire_lock():
    lock_fd = open(LOCK_FILE, 'w')
    try:
        # Try to acquire non-blocking exclusive lock
        fcntl.lockf(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        return lock_fd
    except IOError:
        lock_fd.close()
        return None

def main():
    lock_fd = None
    retries = 0
    pid = os.getpid()
    user = os.getenv('USER', 'unknown')
    project_dir = os.path.dirname(os.path.abspath(__file__))

    # Read function name from func_name.txt (fail-fast if missing or empty)
    func_name_path = os.path.join(project_dir, "func_name.txt")
    if not os.path.exists(func_name_path):
        print("❌ ERROR: func_name.txt not found at " + func_name_path)
        sys.exit(1)

    try:
        with open(func_name_path, 'r') as f:
            func_name = f.read().strip()
    except IOError as e:
        print("❌ ERROR: Failed to read func_name.txt: " + str(e))
        sys.exit(1)

    if not func_name:
        print("❌ ERROR: func_name.txt is empty at " + func_name_path)
        sys.exit(1)

    # Add to queue as waiting at startup
    add_to_queue(pid, user, func_name, project_dir, "waiting")

    print("🔒 Attempting to acquire Smart Lock...")
    while lock_fd is None:
        lock_fd = acquire_lock()
        if lock_fd is None:
            if retries == 0:
                print("⏳ Waiting for resources to become available (another job may be running)...")
            elif retries % 10 == 0: # Log every 10 retries (every 5 minutes)
                print(f"⏳ Still waiting... (attempt {{retries}}/{{MAX_RETRIES}})")
            time.sleep(RETRY_DELAY)
            retries += 1
            if retries > MAX_RETRIES:
                print(f"❌ Timeout: Could not acquire lock after {{MAX_RETRIES}} attempts ({{MAX_RETRIES * RETRY_DELAY / 60:.1f}} minutes)")
                # Remove from queue before exiting
                remove_from_queue(pid)
                sys.exit(1)

    # Update queue status to running and write function name to lock file
    update_queue_status(pid, "running")
    lock_fd.seek(0)
    lock_fd.truncate()
    lock_fd.write(str(pid) + "\\n" + user + "\\n" + func_name + "\\n" + project_dir + "\\n")
    lock_fd.flush()

    print("✅ Lock acquired! Starting job execution...")
    # Lock acquired, run payload
    # Use venv Python if available, otherwise fallback to sys.executable
    venv_python = os.path.join(os.path.dirname(__file__), "venv", "bin", "python")
    if os.path.exists(venv_python):
        python_cmd = venv_python
    else:
        python_cmd = sys.executable
    try:
        subprocess.check_call([python_cmd, "spython.py"])
    finally:
        # Remove from queue before releasing lock
        remove_from_queue(pid)
        # Release lock
        print("🔓 Releasing Smart Lock...")
        fcntl.lockf(lock_fd, fcntl.LOCK_UN)
        lock_fd.close()
        print("✅ Lock released")

if __name__ == "__main__":
    main()
"""
        with open(os.path.join(self.launcher.project_path, filename), "w") as f:
            f.write(content)