gitarsenal-cli 1.6.5 → 1.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gitarsenal-cli",
3
- "version": "1.6.5",
3
+ "version": "1.6.8",
4
4
  "description": "CLI tool for creating Modal sandboxes with GitHub repositories",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -0,0 +1,124 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Alternative CUDA image options for GitArsenal CLI
4
+ These images are more stable and less likely to cause segmentation faults
5
+ """
6
+
7
+ import modal
8
+
9
def get_stable_cuda_image():
    """
    Return an image definition based on the CUDA 11.8 runtime for Ubuntu 22.04.

    The base is pulled from the nvidia/cuda registry repository and Python 3.11
    is added on top. The module docstring presents this as a more conservative
    choice intended to reduce segmentation faults.
    """
    registry_tag = "nvidia/cuda:11.8.0-runtime-ubuntu22.04"
    return modal.Image.from_registry(registry_tag, add_python="3.11")
14
+
15
def get_lightweight_cuda_image():
    """
    Return an image definition based on the CUDA 11.8 "base" flavor.

    Intended for basic GPU operations; uses the Ubuntu 22.04 variant of the
    nvidia/cuda image with Python 3.11 added.
    """
    registry_tag = "nvidia/cuda:11.8.0-base-ubuntu22.04"
    return modal.Image.from_registry(registry_tag, add_python="3.11")
20
+
21
def get_latest_stable_cuda_image():
    """
    Return an image definition based on the CUDA 12.1 runtime for Ubuntu 22.04.

    This is the newest CUDA line offered by this module; 12.1 is deliberately
    used in place of 12.4. Python 3.11 is added to the image.
    """
    registry_tag = "nvidia/cuda:12.1.0-runtime-ubuntu22.04"
    return modal.Image.from_registry(registry_tag, add_python="3.11")
26
+
27
def get_minimal_cuda_image():
    """
    Get a minimal CUDA image with just the essentials

    Returns:
        modal.Image: Image definition built from the CUDA 11.8 "base" flavor
        on Ubuntu 22.04, with Python 3.11 added
    """
    # The nvidia/cuda repository publishes "base", "runtime" and "devel"
    # flavors (plus cuDNN variants); there is no "minimal" flavor, so the
    # previous "11.8.0-minimal-ubuntu22.04" tag could not be pulled.
    # "base" is the smallest published flavor.
    return modal.Image.from_registry("nvidia/cuda:11.8.0-base-ubuntu22.04", add_python="3.11")
32
+
33
def get_custom_cuda_image():
    """
    Assemble a CUDA 11.8 runtime image prepared for SSH access.

    Starting from the nvidia/cuda 11.8 runtime image (Ubuntu 22.04) with
    Python 3.11, the image gets a set of system packages, a few Python
    packages, and then a sequence of shell commands that configure sshd,
    export GPU-related environment variables and set the shell prompt.
    """
    system_packages = (
        "openssh-server", "sudo", "curl", "wget", "vim", "htop", "git",
        "python3", "python3-pip", "build-essential", "tmux", "screen", "nano",
        "gpg", "ca-certificates", "software-properties-common",
    )
    python_packages = ("uv", "modal", "requests", "openai")

    # SSH setup: runtime directories, then sshd_config edits that switch on
    # root login, password authentication and public-key authentication,
    # keep-alive settings, and finally host key generation.
    ssh_commands = (
        "mkdir -p /var/run/sshd",
        "mkdir -p /root/.ssh",
        "chmod 700 /root/.ssh",
        "sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config",
        "sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config",
        "sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config",
        "echo 'ClientAliveInterval 60' >> /etc/ssh/sshd_config",
        "echo 'ClientAliveCountMax 3' >> /etc/ssh/sshd_config",
        "ssh-keygen -A",
    )

    # GPU compatibility: variables appended to root's .bashrc, so they apply
    # to interactive bash sessions for root.
    gpu_commands = (
        "echo 'export CUDA_VISIBLE_DEVICES=0' >> /root/.bashrc",
        "echo 'export NVIDIA_VISIBLE_DEVICES=all' >> /root/.bashrc",
        "echo 'export NVIDIA_DRIVER_CAPABILITIES=compute,utility' >> /root/.bashrc",
    )

    # Bash prompt
    prompt_commands = (
        "echo 'export PS1=\"\\[\\e[1;32m\\]modal:\\[\\e[1;34m\\]\\w\\[\\e[0m\\]$ \"' >> /root/.bashrc",
    )

    base_image = modal.Image.from_registry("nvidia/cuda:11.8.0-runtime-ubuntu22.04", add_python="3.11")
    with_system_packages = base_image.apt_install(*system_packages)
    with_python_packages = with_system_packages.pip_install(*python_packages)
    # Commands are passed in the same order as before: SSH, GPU, prompt.
    return with_python_packages.run_commands(*ssh_commands, *gpu_commands, *prompt_commands)
66
+
67
# Registry of image factories keyed by option name. Every value is a
# zero-argument callable, so nothing is constructed until an entry is invoked.
CUDA_IMAGE_OPTIONS = dict(
    stable=get_stable_cuda_image,
    lightweight=get_lightweight_cuda_image,
    latest=get_latest_stable_cuda_image,
    minimal=get_minimal_cuda_image,
    custom=get_custom_cuda_image,
    default=lambda: modal.Image.debian_slim(),  # No CUDA, most stable
)
76
+
77
def get_cuda_image(option="default"):
    """
    Look up an image factory by name and return the image it produces.

    Args:
        option (str): Key into CUDA_IMAGE_OPTIONS: "stable", "lightweight",
            "latest", "minimal", "custom" or "default"

    Returns:
        modal.Image: Result of calling the selected factory. An unrecognised
        option prints a warning and falls back to the "default" entry.
    """
    factory = CUDA_IMAGE_OPTIONS.get(option)
    if factory is None:
        # Unknown key: warn on stdout and use the fallback entry instead.
        print(f"⚠️ Unknown CUDA image option: {option}. Using default.")
        factory = CUDA_IMAGE_OPTIONS["default"]
    return factory()
92
+
93
def test_cuda_image_stability(image_func, name):
    """
    Check that an image factory can be called without raising

    Args:
        image_func: Zero-argument callable that returns a modal.Image
        name (str): Name of the image, used only in the printed messages

    Returns:
        bool: True if image_func() returned normally, False if it raised an
        Exception (the error is printed rather than propagated)
    """
    print(f"🧪 Testing {name} CUDA image...")
    try:
        # The returned object is deliberately discarded: only whether the
        # call succeeds matters here.
        # NOTE(review): this verifies that the image definition can be
        # constructed; it presumably does not trigger a remote build or
        # pull - confirm against the Modal documentation.
        image_func()
    except Exception as e:
        # Broad catch is intentional: this is a diagnostic helper that
        # reports any failure and lets the caller continue with other images.
        print(f"❌ {name} image failed: {e}")
        return False
    print(f"✅ {name} image created successfully")
    return True


# Despite the "test_" prefix this is a helper taking two arguments, not a
# pytest test. Opt out of collection so importing it into a test module does
# not produce a "fixture not found" error.
test_cuda_image_stability.__test__ = False
112
+
113
if __name__ == "__main__":
    # Script entry point: exercise every registered image factory, then
    # print a short guide to choosing between them.
    print("🧪 Testing CUDA image stability...")

    for option_name in CUDA_IMAGE_OPTIONS:
        test_cuda_image_stability(CUDA_IMAGE_OPTIONS[option_name], option_name)

    recommendation_lines = (
        "\n📋 CUDA Image Recommendations:",
        "• For maximum stability: Use 'default' (no CUDA)",
        "• For basic GPU operations: Use 'stable' (CUDA 11.8 runtime)",
        "• For minimal GPU support: Use 'minimal' (CUDA 11.8 minimal)",
        "• For latest features: Use 'latest' (CUDA 12.1 runtime)",
        "• For custom setup: Use 'custom' (CUDA 11.8 with SSH)",
    )
    for text in recommendation_lines:
        print(text)
@@ -51,14 +51,14 @@ try:
51
51
  print(f"✅ Using tokens from proxy server or defaults")
52
52
  except (ImportError, ValueError) as e:
53
53
  # If the module is not available or tokens are invalid, use hardcoded tokens
54
- # print(f"⚠️ Using default tokens")
54
+ print("")
55
55
 
56
56
  # print("🔧 Fixing Modal token (basic implementation)...")
57
57
 
58
58
  # Set environment variables
59
59
  # os.environ["MODAL_TOKEN_ID"] = TOKEN_ID
60
- os.environ["MODAL_TOKEN_SECRET"] = TOKEN_SECRET
61
- print(f"✅ Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables")
60
+ # os.environ["MODAL_TOKEN_SECRET"] = TOKEN_SECRET
61
+ # print(f"✅ Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables")
62
62
 
63
63
  # Create token file
64
64
  modal_dir = Path.home() / ".modal"
@@ -28,6 +28,7 @@ try:
28
28
  except ImportError:
29
29
  # If the module is not available, use hardcoded tokens
30
30
  # print(f"⚠️ Using default tokens")
31
+ print("")
31
32
 
32
33
  # print("🔧 Advanced Modal Token Fixer")
33
34
 
@@ -118,6 +119,7 @@ try:
118
119
  # print(f"✅ Set tokens via _auth_config")
119
120
  except Exception as e:
120
121
  # print(f"❌ Error setting tokens via _auth_config: {e}")
122
+ print("")
121
123
 
122
124
  try:
123
125
  # Approach 4.2: Set token via set_token() if it exists
@@ -126,7 +128,7 @@ try:
126
128
  # print(f"✅ Set tokens via set_token()")
127
129
  except Exception as e:
128
130
  # print(f"❌ Error setting tokens via set_token(): {e}")
129
-
131
+ print("")
130
132
  try:
131
133
  # Approach 4.3: Set token via Config
132
134
  if hasattr(modal.config, 'Config'):
@@ -135,6 +137,7 @@ try:
135
137
  # print(f"✅ Set tokens via Config")
136
138
  except Exception as e:
137
139
  # print(f"❌ Error setting tokens via Config: {e}")
140
+ print("")
138
141
 
139
142
  # Approach 4.4: Inspect modal.config and try to find token-related attributes
140
143
  # print("\n🔍 Inspecting modal.config for token-related attributes...")
@@ -151,8 +154,10 @@ try:
151
154
  setattr(attr, "token_secret", TOKEN_SECRET)
152
155
  except Exception as e:
153
156
  # print(f" - Error setting tokens in {name}: {e}")
157
+ print("")
154
158
  except Exception as e:
155
159
  # print(f"❌ Error setting tokens in Modal config: {e}")
160
+ print("")
156
161
  except Exception as e:
157
162
  print(f"❌ Error importing Modal: {e}")
158
163
 
@@ -192,6 +197,7 @@ try:
192
197
  setattr(module, func_name, get_token_id)
193
198
  except Exception as e:
194
199
  # print(f" - Error patching {name}.{func_name}: {e}")
200
+ print("")
195
201
  elif "token_secret" in func_name.lower() or "token" in func_name.lower():
196
202
  try:
197
203
  original_func = getattr(module, func_name)
@@ -200,11 +206,12 @@ try:
200
206
  setattr(module, func_name, get_token_secret)
201
207
  except Exception as e:
202
208
  # print(f" - Error patching {name}.{func_name}: {e}")
209
+ print("")
203
210
 
204
211
  # print(f"✅ Monkey-patching completed")
205
212
  except Exception as e:
206
213
  # print(f"❌ Error during monkey-patching: {e}")
207
-
214
+ print("")
208
215
  # Approach 6: Test Modal authentication
209
216
  # print("\n📋 Approach 6: Testing Modal authentication")
210
217
  try:
@@ -227,5 +234,6 @@ try:
227
234
 
228
235
  except Exception as e:
229
236
  # print(f"❌ Error testing Modal authentication: {e}")
237
+ print("")
230
238
 
231
239
  # print("\n✅ Done fixing Modal token. Please try your command again.")
@@ -968,218 +968,6 @@ def generate_random_password(length=16):
968
968
  password = ''.join(secrets.choice(alphabet) for i in range(length))
969
969
  return password
970
970
 
971
- # First, add the standalone ssh_container function at the module level, before the create_modal_ssh_container function
972
-
973
- # Define a module-level ssh container function
974
- ssh_app = modal.App("ssh-container-app")
975
-
976
- @ssh_app.function(
977
- image=modal.Image.debian_slim()
978
- .apt_install(
979
- "openssh-server", "sudo", "curl", "wget", "vim", "htop", "git",
980
- "python3", "python3-pip", "build-essential", "tmux", "screen", "nano",
981
- "gpg", "ca-certificates", "software-properties-common"
982
- )
983
- .pip_install("uv", "modal", "requests", "openai") # Fast Python package installer and Modal
984
- .run_commands(
985
- # Create SSH directory
986
- "mkdir -p /var/run/sshd",
987
- "mkdir -p /root/.ssh",
988
- "chmod 700 /root/.ssh",
989
-
990
- # Configure SSH server
991
- "sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config",
992
- "sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config",
993
- "sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config",
994
-
995
- # SSH keep-alive settings
996
- "echo 'ClientAliveInterval 60' >> /etc/ssh/sshd_config",
997
- "echo 'ClientAliveCountMax 3' >> /etc/ssh/sshd_config",
998
-
999
- # Generate SSH host keys
1000
- "ssh-keygen -A",
1001
-
1002
- # Install Modal CLI
1003
- "pip install modal",
1004
-
1005
- # Set up a nice bash prompt
1006
- "echo 'export PS1=\"\\[\\e[1;32m\\]modal:\\[\\e[1;34m\\]\\w\\[\\e[0m\\]$ \"' >> /root/.bashrc",
1007
- ),
1008
- timeout=3600, # Default 1 hour timeout
1009
- gpu="a10g", # Default GPU - this will be overridden when called
1010
- cpu=2,
1011
- memory=8192,
1012
- serialized=True,
1013
- )
1014
- def ssh_container_function(ssh_password, repo_url=None, repo_name=None, setup_commands=None, openai_api_key=None):
1015
- import subprocess
1016
- import time
1017
- import os
1018
-
1019
- # Set root password
1020
- subprocess.run(["bash", "-c", f"echo 'root:{ssh_password}' | chpasswd"], check=True)
1021
-
1022
- # Start SSH service
1023
- subprocess.run(["service", "ssh", "start"], check=True)
1024
-
1025
- # Setup environment
1026
- os.environ['PS1'] = r'\[\e[1;32m\]modal:\[\e[1;34m\]\w\[\e[0m\]$ '
1027
-
1028
- # Set OpenAI API key if provided
1029
- if openai_api_key:
1030
- os.environ['OPENAI_API_KEY'] = openai_api_key
1031
- print(f"✅ Set OpenAI API key in container environment (length: {len(openai_api_key)})")
1032
- else:
1033
- print("⚠️ No OpenAI API key provided to container")
1034
-
1035
- # Clone repository if provided
1036
- if repo_url:
1037
- repo_name_from_url = repo_name or repo_url.split('/')[-1].replace('.git', '')
1038
- print(f"📥 Cloning repository: {repo_url}")
1039
-
1040
- try:
1041
- subprocess.run(["git", "clone", repo_url], check=True, cwd="/root")
1042
- print(f"✅ Repository cloned successfully: {repo_name_from_url}")
1043
-
1044
- # Change to repository directory
1045
- repo_dir = f"/root/{repo_name_from_url}"
1046
- if os.path.exists(repo_dir):
1047
- os.chdir(repo_dir)
1048
- print(f"📂 Changed to repository directory: {repo_dir}")
1049
-
1050
- except subprocess.CalledProcessError as e:
1051
- print(f"❌ Failed to clone repository: {e}")
1052
-
1053
- # Run setup commands if provided
1054
- if setup_commands:
1055
- print(f"⚙️ Running {len(setup_commands)} setup commands...")
1056
-
1057
- # First, let's check the current directory structure
1058
- print("🔍 Checking current directory structure before running setup commands...")
1059
- try:
1060
- result = subprocess.run("pwd && ls -la", shell=True, check=True,
1061
- capture_output=True, text=True)
1062
- print(f"📂 Current directory: {result.stdout}")
1063
- except subprocess.CalledProcessError as e:
1064
- print(f"⚠️ Could not check directory structure: {e}")
1065
-
1066
- # Define a simple run_command function for SSH container
1067
- def run_command_with_llm_debug(cmd, show_output=True, retry_count=0, max_retries=3):
1068
- """Execute a command with LLM debugging enabled"""
1069
- print(f"🔧 Executing: {cmd}")
1070
- try:
1071
- # Handle special case for source command which doesn't work with subprocess.run
1072
- if cmd.strip().startswith("source ") or " source " in cmd:
1073
- print("⚠️ Detected 'source' command which doesn't work with subprocess.run")
1074
- print("🔄 Converting to bash -c with dot (.) instead of source")
1075
- # Replace source with . (dot) which is the same as source but works in sh
1076
- modified_cmd = cmd.replace("source ", ". ")
1077
- # Wrap in bash -c to ensure it runs in bash
1078
- bash_cmd = f"bash -c '{modified_cmd}'"
1079
- print(f"🔄 Modified command: {bash_cmd}")
1080
- result = subprocess.run(bash_cmd, shell=True, check=True,
1081
- capture_output=True, text=True)
1082
- else:
1083
- result = subprocess.run(cmd, shell=True, check=True,
1084
- capture_output=True, text=True)
1085
-
1086
- if result.stdout and show_output:
1087
- print(f"✅ Output: {result.stdout}")
1088
- return True, result.stdout, ""
1089
- except subprocess.CalledProcessError as e:
1090
- error_output = e.stderr if e.stderr else str(e)
1091
- print(f"❌ Command failed: {e}")
1092
- print(f"❌ Error: {error_output}")
1093
-
1094
- # Call OpenAI for debugging
1095
- print("🔍 Attempting to debug the failed command with OpenAI...")
1096
- try:
1097
- # Get the current directory for context
1098
- current_dir = os.getcwd()
1099
-
1100
- # Call OpenAI for debugging
1101
- print(f"🔍 DEBUG: About to call call_openai_for_debug...")
1102
- print(f"🔍 DEBUG: Command: {cmd}")
1103
- print(f"🔍 DEBUG: Error output length: {len(error_output)}")
1104
- print(f"🔍 DEBUG: Current directory: {current_dir}")
1105
-
1106
- # Get the API key from environment or use the one that was fetched earlier
1107
- api_key = os.environ.get("OPENAI_API_KEY")
1108
- fix_command = call_openai_for_debug(cmd, error_output, api_key=api_key, current_dir=current_dir)
1109
-
1110
- print(f"🔍 DEBUG: call_openai_for_debug returned: {fix_command}")
1111
-
1112
- if fix_command:
1113
- print(f"🔧 OpenAI suggested fix command: {fix_command}")
1114
-
1115
- # Run the fix command
1116
- print(f"🔄 Running suggested fix command: {fix_command}")
1117
- try:
1118
- fix_result = subprocess.run(fix_command, shell=True, check=True,
1119
- capture_output=True, text=True)
1120
- if fix_result.stdout:
1121
- print(f"✅ Fix command output: {fix_result.stdout}")
1122
-
1123
- # Retry the original command
1124
- print(f"🔄 Retrying original command: {cmd}")
1125
- return run_command_with_llm_debug(cmd, show_output, retry_count + 1, max_retries)
1126
- except subprocess.CalledProcessError as fix_e:
1127
- print(f"❌ Fix command also failed: {fix_e}")
1128
- return False, "", error_output
1129
- else:
1130
- print("❌ No fix suggested by OpenAI")
1131
- return False, "", error_output
1132
-
1133
- except Exception as debug_e:
1134
- print(f"❌ LLM debugging failed: {debug_e}")
1135
- return False, "", error_output
1136
-
1137
- for i, cmd in enumerate(setup_commands, 1):
1138
- print(f"📋 Executing command {i}/{len(setup_commands)}: {cmd}")
1139
-
1140
- # Check if this is a cd command and if the directory exists
1141
- if cmd.strip().startswith("cd "):
1142
- cd_parts = cmd.split(None, 1)
1143
- if len(cd_parts) >= 2:
1144
- target_dir = cd_parts[1].strip('"\'')
1145
- print(f"🔍 Checking if directory exists: {target_dir}")
1146
- try:
1147
- check_result = subprocess.run(f"test -d '{target_dir}'", shell=True,
1148
- capture_output=True, text=True)
1149
- if check_result.returncode != 0:
1150
- print(f"⚠️ Directory does not exist: {target_dir}")
1151
- print(f"🔍 Current directory contents:")
1152
- subprocess.run("pwd && ls -la", shell=True, check=False)
1153
-
1154
- # Try to find similar directories
1155
- print(f"🔍 Looking for similar directories...")
1156
- subprocess.run("find . -type d -name '*llama*' -o -name '*nano*' 2>/dev/null | head -10", shell=True, check=False)
1157
- except Exception as e:
1158
- print(f"⚠️ Could not check directory: {e}")
1159
-
1160
- success, stdout, stderr = run_command_with_llm_debug(cmd, show_output=True)
1161
- if not success:
1162
- print(f"⚠️ Command {i} failed, but continuing with remaining commands...")
1163
-
1164
- # If this was a cd command that failed, try to understand the directory structure
1165
- if cmd.strip().startswith("cd ") and "No such file or directory" in stderr:
1166
- print(f"🔍 Analyzing directory structure after failed cd command...")
1167
- subprocess.run("pwd && ls -la && echo '--- Parent directory ---' && ls -la ..", shell=True, check=False)
1168
-
1169
- # Get container info
1170
- print("🔍 Container started successfully!")
1171
- print(f"🆔 Container ID: {os.environ.get('MODAL_TASK_ID', 'unknown')}")
1172
-
1173
- # Keep the container running
1174
- while True:
1175
- time.sleep(30)
1176
- # Check if SSH service is still running
1177
- try:
1178
- subprocess.run(["service", "ssh", "status"], check=True,
1179
- capture_output=True)
1180
- except subprocess.CalledProcessError:
1181
- print("⚠️ SSH service stopped, restarting...")
1182
- subprocess.run(["service", "ssh", "start"], check=True)
1183
971
 
1184
972
  # Now modify the create_modal_ssh_container function to use the standalone ssh_container_function
1185
973
  def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_commands=None,
@@ -1400,7 +1188,7 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1400
1188
  .apt_install(
1401
1189
  "openssh-server", "sudo", "curl", "wget", "vim", "htop", "git",
1402
1190
  "python3", "python3-pip", "build-essential", "tmux", "screen", "nano",
1403
- "gpg", "ca-certificates", "software-properties-common"
1191
+ "gpg", "ca-certificates", "software-properties-common", "nvtop"
1404
1192
  )
1405
1193
  .pip_install("uv", "modal", "requests", "openai") # Fast Python package installer and Modal
1406
1194
  .run_commands(
@@ -1409,6 +1197,9 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1409
1197
  "mkdir -p /root/.ssh",
1410
1198
  "chmod 700 /root/.ssh",
1411
1199
 
1200
+ # Generate SSH host keys
1201
+ "ssh-keygen -A",
1202
+
1412
1203
  # Configure SSH server
1413
1204
  "sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config",
1414
1205
  "sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config",
@@ -1418,8 +1209,9 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1418
1209
  "echo 'ClientAliveInterval 60' >> /etc/ssh/sshd_config",
1419
1210
  "echo 'ClientAliveCountMax 3' >> /etc/ssh/sshd_config",
1420
1211
 
1421
- # Generate SSH host keys
1422
- "ssh-keygen -A",
1212
+ # Set up CUDA environment
1213
+ "echo 'export PATH=/usr/local/cuda/bin:$PATH' >> /root/.bashrc",
1214
+ "echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> /root/.bashrc",
1423
1215
 
1424
1216
  # Set up a nice bash prompt
1425
1217
  "echo 'export PS1=\"\\[\\e[1;32m\\]modal:\\[\\e[1;34m\\]\\w\\[\\e[0m\\]$ \"' >> /root/.bashrc",
@@ -1435,7 +1227,7 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1435
1227
  if volume:
1436
1228
  volumes_config[volume_mount_path] = volume
1437
1229
 
1438
- # Define the SSH container function
1230
+ # Define the SSH container function - simplified like the example
1439
1231
  @app.function(
1440
1232
  image=ssh_image,
1441
1233
  timeout=timeout_minutes * 60, # Convert to seconds
@@ -1445,7 +1237,7 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1445
1237
  serialized=True,
1446
1238
  volumes=volumes_config if volumes_config else None,
1447
1239
  )
1448
- def ssh_container_function(ssh_password=None, repo_url=None, repo_name=None, setup_commands=None, openai_api_key=None):
1240
+ def start_ssh():
1449
1241
  """Start SSH container with password authentication and optional setup."""
1450
1242
  import subprocess
1451
1243
  import time
@@ -1455,14 +1247,25 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1455
1247
  subprocess.run(["bash", "-c", f"echo 'root:{ssh_password}' | chpasswd"], check=True)
1456
1248
 
1457
1249
  # Set OpenAI API key if provided
1250
+ openai_api_key = os.environ.get("OPENAI_API_KEY")
1458
1251
  if openai_api_key:
1459
1252
  os.environ['OPENAI_API_KEY'] = openai_api_key
1460
1253
  print(f"✅ Set OpenAI API key in container environment (length: {len(openai_api_key)})")
1461
1254
  else:
1462
1255
  print("⚠️ No OpenAI API key provided to container")
1463
1256
 
1464
- # Start SSH service
1465
- subprocess.run(["service", "ssh", "start"], check=True)
1257
+ # Start SSH service using Popen (non-blocking) like in the example
1258
+ subprocess.Popen(["/usr/sbin/sshd", "-D"])
1259
+ time.sleep(2) # Give SSH time to start
1260
+
1261
+ # Test CUDA setup
1262
+ try:
1263
+ print("🔧 Testing CUDA setup...")
1264
+ subprocess.run(["nvidia-smi"], check=True)
1265
+ subprocess.run(["nvcc", "--version"], check=True)
1266
+ print("✅ CUDA setup verified")
1267
+ except subprocess.CalledProcessError as e:
1268
+ print(f"⚠️ CUDA test failed: {e}")
1466
1269
 
1467
1270
  # Clone repository if provided
1468
1271
  if repo_url:
@@ -1590,43 +1393,28 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1590
1393
  print(f"🔍 Analyzing directory structure after failed cd command...")
1591
1394
  subprocess.run("pwd && ls -la && echo '--- Parent directory ---' && ls -la ..", shell=True, check=False)
1592
1395
 
1593
- # Create SSH tunnel
1594
- with modal.forward(22, unencrypted=True) as tunnel:
1595
- host, port = tunnel.tcp_socket
1396
+ # Forward SSH port and keep container alive - exactly like the example
1397
+ with modal.forward(port=22, unencrypted=True) as tunnel:
1398
+ hostname, port = tunnel.tcp_socket
1596
1399
 
1597
1400
  print("\n" + "=" * 80)
1598
1401
  print("🎉 SSH CONTAINER IS READY!")
1599
1402
  print("=" * 80)
1600
- print(f"🌐 SSH Host: {host}")
1601
- print(f"🔌 SSH Port: {port}")
1602
- print(f"👤 Username: root")
1603
- print(f"🔐 Password: {ssh_password}")
1604
- print()
1605
- print("🔗 CONNECT USING THIS COMMAND:")
1606
- print(f"ssh -p {port} root@{host}")
1403
+ print(f"SSH: ssh -p {port} root@{hostname}")
1404
+ print(f"Password: {ssh_password}")
1607
1405
  print("=" * 80)
1608
1406
 
1609
- # Keep the container running
1407
+ # Keep alive - simplified like the example
1610
1408
  while True:
1611
- time.sleep(30)
1612
- # Check if SSH service is still running
1613
- try:
1614
- subprocess.run(["service", "ssh", "status"], check=True,
1615
- capture_output=True)
1616
- except subprocess.CalledProcessError:
1617
- print("⚠️ SSH service stopped, restarting...")
1618
- subprocess.run(["service", "ssh", "start"], check=True)
1409
+ time.sleep(60)
1619
1410
 
1620
1411
  # Run the container
1621
1412
  try:
1622
1413
  print("⏳ Starting container... This may take 1-2 minutes...")
1623
1414
 
1624
- # Start the container in a new thread to avoid blocking
1625
- with modal.enable_output():
1626
- with app.run():
1627
- # Get the API key from environment
1628
- api_key = os.environ.get("OPENAI_API_KEY")
1629
- ssh_container_function.remote(ssh_password, repo_url, repo_name, setup_commands, api_key)
1415
+ # Start the container - simplified like the example
1416
+ with app.run():
1417
+ start_ssh.remote()
1630
1418
 
1631
1419
  # Clean up Modal token after container is successfully created
1632
1420
  cleanup_modal_token()
@@ -968,218 +968,6 @@ def generate_random_password(length=16):
968
968
  password = ''.join(secrets.choice(alphabet) for i in range(length))
969
969
  return password
970
970
 
971
- # First, add the standalone ssh_container function at the module level, before the create_modal_ssh_container function
972
-
973
- # Define a module-level ssh container function
974
- ssh_app = modal.App("ssh-container-app")
975
-
976
- @ssh_app.function(
977
- image=modal.Image.debian_slim()
978
- .apt_install(
979
- "openssh-server", "sudo", "curl", "wget", "vim", "htop", "git",
980
- "python3", "python3-pip", "build-essential", "tmux", "screen", "nano",
981
- "gpg", "ca-certificates", "software-properties-common"
982
- )
983
- .pip_install("uv", "modal", "requests", "openai") # Fast Python package installer and Modal
984
- .run_commands(
985
- # Create SSH directory
986
- "mkdir -p /var/run/sshd",
987
- "mkdir -p /root/.ssh",
988
- "chmod 700 /root/.ssh",
989
-
990
- # Configure SSH server
991
- "sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config",
992
- "sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config",
993
- "sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config",
994
-
995
- # SSH keep-alive settings
996
- "echo 'ClientAliveInterval 60' >> /etc/ssh/sshd_config",
997
- "echo 'ClientAliveCountMax 3' >> /etc/ssh/sshd_config",
998
-
999
- # Generate SSH host keys
1000
- "ssh-keygen -A",
1001
-
1002
- # Install Modal CLI
1003
- "pip install modal",
1004
-
1005
- # Set up a nice bash prompt
1006
- "echo 'export PS1=\"\\[\\e[1;32m\\]modal:\\[\\e[1;34m\\]\\w\\[\\e[0m\\]$ \"' >> /root/.bashrc",
1007
- ),
1008
- timeout=3600, # Default 1 hour timeout
1009
- gpu="a10g", # Default GPU - this will be overridden when called
1010
- cpu=2,
1011
- memory=8192,
1012
- serialized=True,
1013
- )
1014
- def ssh_container_function(ssh_password, repo_url=None, repo_name=None, setup_commands=None, openai_api_key=None):
1015
- import subprocess
1016
- import time
1017
- import os
1018
-
1019
- # Set root password
1020
- subprocess.run(["bash", "-c", f"echo 'root:{ssh_password}' | chpasswd"], check=True)
1021
-
1022
- # Start SSH service
1023
- subprocess.run(["service", "ssh", "start"], check=True)
1024
-
1025
- # Setup environment
1026
- os.environ['PS1'] = r'\[\e[1;32m\]modal:\[\e[1;34m\]\w\[\e[0m\]$ '
1027
-
1028
- # Set OpenAI API key if provided
1029
- if openai_api_key:
1030
- os.environ['OPENAI_API_KEY'] = openai_api_key
1031
- print(f"✅ Set OpenAI API key in container environment (length: {len(openai_api_key)})")
1032
- else:
1033
- print("⚠️ No OpenAI API key provided to container")
1034
-
1035
- # Clone repository if provided
1036
- if repo_url:
1037
- repo_name_from_url = repo_name or repo_url.split('/')[-1].replace('.git', '')
1038
- print(f"📥 Cloning repository: {repo_url}")
1039
-
1040
- try:
1041
- subprocess.run(["git", "clone", repo_url], check=True, cwd="/root")
1042
- print(f"✅ Repository cloned successfully: {repo_name_from_url}")
1043
-
1044
- # Change to repository directory
1045
- repo_dir = f"/root/{repo_name_from_url}"
1046
- if os.path.exists(repo_dir):
1047
- os.chdir(repo_dir)
1048
- print(f"📂 Changed to repository directory: {repo_dir}")
1049
-
1050
- except subprocess.CalledProcessError as e:
1051
- print(f"❌ Failed to clone repository: {e}")
1052
-
1053
- # Run setup commands if provided
1054
- if setup_commands:
1055
- print(f"⚙️ Running {len(setup_commands)} setup commands...")
1056
-
1057
- # First, let's check the current directory structure
1058
- print("🔍 Checking current directory structure before running setup commands...")
1059
- try:
1060
- result = subprocess.run("pwd && ls -la", shell=True, check=True,
1061
- capture_output=True, text=True)
1062
- print(f"📂 Current directory: {result.stdout}")
1063
- except subprocess.CalledProcessError as e:
1064
- print(f"⚠️ Could not check directory structure: {e}")
1065
-
1066
- # Define a simple run_command function for SSH container
1067
- def run_command_with_llm_debug(cmd, show_output=True, retry_count=0, max_retries=3):
1068
- """Execute a command with LLM debugging enabled"""
1069
- print(f"🔧 Executing: {cmd}")
1070
- try:
1071
- # Handle special case for source command which doesn't work with subprocess.run
1072
- if cmd.strip().startswith("source ") or " source " in cmd:
1073
- print("⚠️ Detected 'source' command which doesn't work with subprocess.run")
1074
- print("🔄 Converting to bash -c with dot (.) instead of source")
1075
- # Replace source with . (dot) which is the same as source but works in sh
1076
- modified_cmd = cmd.replace("source ", ". ")
1077
- # Wrap in bash -c to ensure it runs in bash
1078
- bash_cmd = f"bash -c '{modified_cmd}'"
1079
- print(f"🔄 Modified command: {bash_cmd}")
1080
- result = subprocess.run(bash_cmd, shell=True, check=True,
1081
- capture_output=True, text=True)
1082
- else:
1083
- result = subprocess.run(cmd, shell=True, check=True,
1084
- capture_output=True, text=True)
1085
-
1086
- if result.stdout and show_output:
1087
- print(f"✅ Output: {result.stdout}")
1088
- return True, result.stdout, ""
1089
- except subprocess.CalledProcessError as e:
1090
- error_output = e.stderr if e.stderr else str(e)
1091
- print(f"❌ Command failed: {e}")
1092
- print(f"❌ Error: {error_output}")
1093
-
1094
- # Call OpenAI for debugging
1095
- print("🔍 Attempting to debug the failed command with OpenAI...")
1096
- try:
1097
- # Get the current directory for context
1098
- current_dir = os.getcwd()
1099
-
1100
- # Call OpenAI for debugging
1101
- print(f"🔍 DEBUG: About to call call_openai_for_debug...")
1102
- print(f"🔍 DEBUG: Command: {cmd}")
1103
- print(f"🔍 DEBUG: Error output length: {len(error_output)}")
1104
- print(f"🔍 DEBUG: Current directory: {current_dir}")
1105
-
1106
- # Get the API key from environment or use the one that was fetched earlier
1107
- api_key = os.environ.get("OPENAI_API_KEY")
1108
- fix_command = call_openai_for_debug(cmd, error_output, api_key=api_key, current_dir=current_dir)
1109
-
1110
- print(f"🔍 DEBUG: call_openai_for_debug returned: {fix_command}")
1111
-
1112
- if fix_command:
1113
- print(f"🔧 OpenAI suggested fix command: {fix_command}")
1114
-
1115
- # Run the fix command
1116
- print(f"🔄 Running suggested fix command: {fix_command}")
1117
- try:
1118
- fix_result = subprocess.run(fix_command, shell=True, check=True,
1119
- capture_output=True, text=True)
1120
- if fix_result.stdout:
1121
- print(f"✅ Fix command output: {fix_result.stdout}")
1122
-
1123
- # Retry the original command
1124
- print(f"🔄 Retrying original command: {cmd}")
1125
- return run_command_with_llm_debug(cmd, show_output, retry_count + 1, max_retries)
1126
- except subprocess.CalledProcessError as fix_e:
1127
- print(f"❌ Fix command also failed: {fix_e}")
1128
- return False, "", error_output
1129
- else:
1130
- print("❌ No fix suggested by OpenAI")
1131
- return False, "", error_output
1132
-
1133
- except Exception as debug_e:
1134
- print(f"❌ LLM debugging failed: {debug_e}")
1135
- return False, "", error_output
1136
-
1137
- for i, cmd in enumerate(setup_commands, 1):
1138
- print(f"📋 Executing command {i}/{len(setup_commands)}: {cmd}")
1139
-
1140
- # Check if this is a cd command and if the directory exists
1141
- if cmd.strip().startswith("cd "):
1142
- cd_parts = cmd.split(None, 1)
1143
- if len(cd_parts) >= 2:
1144
- target_dir = cd_parts[1].strip('"\'')
1145
- print(f"🔍 Checking if directory exists: {target_dir}")
1146
- try:
1147
- check_result = subprocess.run(f"test -d '{target_dir}'", shell=True,
1148
- capture_output=True, text=True)
1149
- if check_result.returncode != 0:
1150
- print(f"⚠️ Directory does not exist: {target_dir}")
1151
- print(f"🔍 Current directory contents:")
1152
- subprocess.run("pwd && ls -la", shell=True, check=False)
1153
-
1154
- # Try to find similar directories
1155
- print(f"🔍 Looking for similar directories...")
1156
- subprocess.run("find . -type d -name '*llama*' -o -name '*nano*' 2>/dev/null | head -10", shell=True, check=False)
1157
- except Exception as e:
1158
- print(f"⚠️ Could not check directory: {e}")
1159
-
1160
- success, stdout, stderr = run_command_with_llm_debug(cmd, show_output=True)
1161
- if not success:
1162
- print(f"⚠️ Command {i} failed, but continuing with remaining commands...")
1163
-
1164
- # If this was a cd command that failed, try to understand the directory structure
1165
- if cmd.strip().startswith("cd ") and "No such file or directory" in stderr:
1166
- print(f"🔍 Analyzing directory structure after failed cd command...")
1167
- subprocess.run("pwd && ls -la && echo '--- Parent directory ---' && ls -la ..", shell=True, check=False)
1168
-
1169
- # Get container info
1170
- print("🔍 Container started successfully!")
1171
- print(f"🆔 Container ID: {os.environ.get('MODAL_TASK_ID', 'unknown')}")
1172
-
1173
- # Keep the container running
1174
- while True:
1175
- time.sleep(30)
1176
- # Check if SSH service is still running
1177
- try:
1178
- subprocess.run(["service", "ssh", "status"], check=True,
1179
- capture_output=True)
1180
- except subprocess.CalledProcessError:
1181
- print("⚠️ SSH service stopped, restarting...")
1182
- subprocess.run(["service", "ssh", "start"], check=True)
1183
971
 
1184
972
  # Now modify the create_modal_ssh_container function to use the standalone ssh_container_function
1185
973
  def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_commands=None,
@@ -1400,7 +1188,7 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1400
1188
  .apt_install(
1401
1189
  "openssh-server", "sudo", "curl", "wget", "vim", "htop", "git",
1402
1190
  "python3", "python3-pip", "build-essential", "tmux", "screen", "nano",
1403
- "gpg", "ca-certificates", "software-properties-common"
1191
+ "gpg", "ca-certificates", "software-properties-common", "nvtop"
1404
1192
  )
1405
1193
  .pip_install("uv", "modal", "requests", "openai") # Fast Python package installer and Modal
1406
1194
  .run_commands(
@@ -1409,6 +1197,9 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1409
1197
  "mkdir -p /root/.ssh",
1410
1198
  "chmod 700 /root/.ssh",
1411
1199
 
1200
+ # Generate SSH host keys
1201
+ "ssh-keygen -A",
1202
+
1412
1203
  # Configure SSH server
1413
1204
  "sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config",
1414
1205
  "sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config",
@@ -1418,8 +1209,9 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1418
1209
  "echo 'ClientAliveInterval 60' >> /etc/ssh/sshd_config",
1419
1210
  "echo 'ClientAliveCountMax 3' >> /etc/ssh/sshd_config",
1420
1211
 
1421
- # Generate SSH host keys
1422
- "ssh-keygen -A",
1212
+ # Set up CUDA environment
1213
+ "echo 'export PATH=/usr/local/cuda/bin:$PATH' >> /root/.bashrc",
1214
+ "echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> /root/.bashrc",
1423
1215
 
1424
1216
  # Set up a nice bash prompt
1425
1217
  "echo 'export PS1=\"\\[\\e[1;32m\\]modal:\\[\\e[1;34m\\]\\w\\[\\e[0m\\]$ \"' >> /root/.bashrc",
@@ -1435,7 +1227,7 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1435
1227
  if volume:
1436
1228
  volumes_config[volume_mount_path] = volume
1437
1229
 
1438
- # Define the SSH container function
1230
+ # Define the SSH container function - simplified like the example
1439
1231
  @app.function(
1440
1232
  image=ssh_image,
1441
1233
  timeout=timeout_minutes * 60, # Convert to seconds
@@ -1445,7 +1237,7 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1445
1237
  serialized=True,
1446
1238
  volumes=volumes_config if volumes_config else None,
1447
1239
  )
1448
- def ssh_container_function(ssh_password=None, repo_url=None, repo_name=None, setup_commands=None, openai_api_key=None):
1240
+ def start_ssh():
1449
1241
  """Start SSH container with password authentication and optional setup."""
1450
1242
  import subprocess
1451
1243
  import time
@@ -1455,14 +1247,25 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1455
1247
  subprocess.run(["bash", "-c", f"echo 'root:{ssh_password}' | chpasswd"], check=True)
1456
1248
 
1457
1249
  # Set OpenAI API key if provided
1250
+ openai_api_key = os.environ.get("OPENAI_API_KEY")
1458
1251
  if openai_api_key:
1459
1252
  os.environ['OPENAI_API_KEY'] = openai_api_key
1460
1253
  print(f"✅ Set OpenAI API key in container environment (length: {len(openai_api_key)})")
1461
1254
  else:
1462
1255
  print("⚠️ No OpenAI API key provided to container")
1463
1256
 
1464
- # Start SSH service
1465
- subprocess.run(["service", "ssh", "start"], check=True)
1257
+ # Start SSH service using Popen (non-blocking) like in the example
1258
+ subprocess.Popen(["/usr/sbin/sshd", "-D"])
1259
+ time.sleep(2) # Give SSH time to start
1260
+
1261
+ # Test CUDA setup
1262
+ try:
1263
+ print("🔧 Testing CUDA setup...")
1264
+ subprocess.run(["nvidia-smi"], check=True)
1265
+ subprocess.run(["nvcc", "--version"], check=True)
1266
+ print("✅ CUDA setup verified")
1267
+ except subprocess.CalledProcessError as e:
1268
+ print(f"⚠️ CUDA test failed: {e}")
1466
1269
 
1467
1270
  # Clone repository if provided
1468
1271
  if repo_url:
@@ -1590,43 +1393,28 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
1590
1393
  print(f"🔍 Analyzing directory structure after failed cd command...")
1591
1394
  subprocess.run("pwd && ls -la && echo '--- Parent directory ---' && ls -la ..", shell=True, check=False)
1592
1395
 
1593
- # Create SSH tunnel
1594
- with modal.forward(22, unencrypted=True) as tunnel:
1595
- host, port = tunnel.tcp_socket
1396
+ # Forward SSH port and keep container alive - exactly like the example
1397
+ with modal.forward(port=22, unencrypted=True) as tunnel:
1398
+ hostname, port = tunnel.tcp_socket
1596
1399
 
1597
1400
  print("\n" + "=" * 80)
1598
1401
  print("🎉 SSH CONTAINER IS READY!")
1599
1402
  print("=" * 80)
1600
- print(f"🌐 SSH Host: {host}")
1601
- print(f"🔌 SSH Port: {port}")
1602
- print(f"👤 Username: root")
1603
- print(f"🔐 Password: {ssh_password}")
1604
- print()
1605
- print("🔗 CONNECT USING THIS COMMAND:")
1606
- print(f"ssh -p {port} root@{host}")
1403
+ print(f"SSH: ssh -p {port} root@{hostname}")
1404
+ print(f"Password: {ssh_password}")
1607
1405
  print("=" * 80)
1608
1406
 
1609
- # Keep the container running
1407
+ # Keep alive - simplified like the example
1610
1408
  while True:
1611
- time.sleep(30)
1612
- # Check if SSH service is still running
1613
- try:
1614
- subprocess.run(["service", "ssh", "status"], check=True,
1615
- capture_output=True)
1616
- except subprocess.CalledProcessError:
1617
- print("⚠️ SSH service stopped, restarting...")
1618
- subprocess.run(["service", "ssh", "start"], check=True)
1409
+ time.sleep(60)
1619
1410
 
1620
1411
  # Run the container
1621
1412
  try:
1622
1413
  print("⏳ Starting container... This may take 1-2 minutes...")
1623
1414
 
1624
- # Start the container in a new thread to avoid blocking
1625
- with modal.enable_output():
1626
- with app.run():
1627
- # Get the API key from environment
1628
- api_key = os.environ.get("OPENAI_API_KEY")
1629
- ssh_container_function.remote(ssh_password, repo_url, repo_name, setup_commands, api_key)
1415
+ # Start the container - simplified like the example
1416
+ with app.run():
1417
+ start_ssh.remote()
1630
1418
 
1631
1419
  # Clean up Modal token after container is successfully created
1632
1420
  cleanup_modal_token()