skypilot_nightly-1.0.0.dev20250522-py3-none-any.whl → skypilot_nightly-1.0.0.dev20250524-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/backend_utils.py +62 -45
  4. sky/backends/cloud_vm_ray_backend.py +19 -5
  5. sky/check.py +398 -171
  6. sky/cli.py +302 -98
  7. sky/client/cli.py +302 -98
  8. sky/client/sdk.py +104 -12
  9. sky/clouds/__init__.py +3 -0
  10. sky/clouds/aws.py +4 -2
  11. sky/clouds/azure.py +4 -2
  12. sky/clouds/cloud.py +24 -6
  13. sky/clouds/cudo.py +2 -1
  14. sky/clouds/do.py +2 -1
  15. sky/clouds/fluidstack.py +2 -1
  16. sky/clouds/gcp.py +23 -5
  17. sky/clouds/ibm.py +4 -2
  18. sky/clouds/kubernetes.py +66 -22
  19. sky/clouds/lambda_cloud.py +2 -1
  20. sky/clouds/nebius.py +18 -2
  21. sky/clouds/oci.py +4 -2
  22. sky/clouds/paperspace.py +2 -1
  23. sky/clouds/runpod.py +2 -1
  24. sky/clouds/scp.py +2 -1
  25. sky/clouds/service_catalog/constants.py +1 -1
  26. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  27. sky/clouds/ssh.py +203 -0
  28. sky/clouds/vast.py +2 -1
  29. sky/clouds/vsphere.py +2 -1
  30. sky/core.py +58 -11
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
  35. sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
  37. sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
  52. sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
  53. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  54. sky/dashboard/out/clusters/[cluster].html +1 -1
  55. sky/dashboard/out/clusters.html +1 -1
  56. sky/dashboard/out/index.html +1 -1
  57. sky/dashboard/out/infra.html +1 -1
  58. sky/dashboard/out/jobs/[job].html +1 -1
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -0
  61. sky/dashboard/out/workspaces.html +1 -0
  62. sky/data/storage.py +1 -1
  63. sky/global_user_state.py +42 -19
  64. sky/jobs/constants.py +1 -1
  65. sky/jobs/server/core.py +72 -56
  66. sky/jobs/state.py +26 -5
  67. sky/jobs/utils.py +65 -13
  68. sky/optimizer.py +29 -7
  69. sky/provision/__init__.py +1 -0
  70. sky/provision/aws/instance.py +17 -1
  71. sky/provision/fluidstack/instance.py +1 -0
  72. sky/provision/kubernetes/instance.py +16 -5
  73. sky/provision/kubernetes/utils.py +37 -19
  74. sky/provision/nebius/instance.py +3 -1
  75. sky/provision/nebius/utils.py +14 -2
  76. sky/provision/ssh/__init__.py +18 -0
  77. sky/resources.py +4 -1
  78. sky/serve/server/core.py +9 -6
  79. sky/server/html/token_page.html +6 -1
  80. sky/server/requests/executor.py +1 -0
  81. sky/server/requests/payloads.py +18 -0
  82. sky/server/server.py +108 -5
  83. sky/setup_files/dependencies.py +1 -0
  84. sky/skylet/constants.py +4 -1
  85. sky/skypilot_config.py +83 -9
  86. sky/templates/nebius-ray.yml.j2 +12 -0
  87. sky/utils/cli_utils/status_utils.py +18 -8
  88. sky/utils/infra_utils.py +21 -1
  89. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  90. sky/utils/kubernetes/create_cluster.sh +1 -0
  91. sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
  92. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  93. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  94. sky/utils/log_utils.py +218 -1
  95. sky/utils/schemas.py +75 -0
  96. sky/utils/ux_utils.py +2 -1
  97. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
  98. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
  99. sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
  101. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  102. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
  103. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
  106. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  114. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
  115. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  116. /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/kubernetes_deploy_utils.py
@@ -2,9 +2,12 @@
  import os
  import shlex
  import subprocess
+ import sys
  import tempfile
  from typing import List, Optional

+ import colorama
+
  from sky import check as sky_check
  from sky import sky_logging
  from sky.backends import backend_utils
@@ -19,6 +22,103 @@ from sky.utils import ux_utils

  logger = sky_logging.init_logger(__name__)

+ # Default path for Kubernetes configuration file
+ DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
+
+
+ def deploy_ssh_cluster(cleanup: bool = False,
+                        infra: Optional[str] = None,
+                        kubeconfig_path: Optional[str] = None):
+     """Deploy a Kubernetes cluster on SSH targets.
+
+     This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
+     Kubernetes cluster on the specified machines.
+
+     Args:
+         cleanup: Whether to clean up the cluster instead of deploying.
+         infra: Name of the cluster in ssh_node_pools.yaml to use.
+             If None, the first cluster in the file will be used.
+         kubeconfig_path: Path to save the Kubernetes configuration file.
+             If None, the default ~/.kube/config will be used.
+     """
+     # Prepare command to call deploy_remote_cluster.py script
+     # TODO(romilb): We should move this to a native python method/class call
+     # instead of invoking a script with subprocess.
+     path_to_package = os.path.dirname(__file__)
+     up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
+     cwd = os.path.dirname(os.path.abspath(up_script_path))
+
+     deploy_command = [sys.executable, up_script_path]
+
+     if cleanup:
+         deploy_command.append('--cleanup')
+
+     if infra:
+         deploy_command.extend(['--infra', infra])
+
+     # Use the default kubeconfig path if none is provided
+     kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
+     deploy_command.extend(['--kubeconfig-path', kubeconfig_path])
+
+     # Setup logging paths
+     run_timestamp = sky_logging.get_run_timestamp()
+     log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                             'ssh_up.log')
+
+     if cleanup:
+         msg_str = 'Cleaning up SSH Node Pools...'
+     else:
+         msg_str = 'Initializing deployment to SSH Node Pools...'
+
+     # Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
+     env = os.environ.copy()
+     env['PYTHONUNBUFFERED'] = '1'
+
+     with rich_utils.safe_status(
+             ux_utils.spinner_message(msg_str, log_path=log_path,
+                                      is_local=True)):
+         returncode, _, stderr = log_lib.run_with_log(
+             cmd=deploy_command,
+             log_path=log_path,
+             require_outputs=True,
+             stream_logs=False,  # TODO: Fixme to False after we fix the logging
+             line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
+                                                            is_local=True),
+             cwd=cwd,
+             env=env)
+
+     if returncode == 0:
+         success = True
+     else:
+         with ux_utils.print_exception_no_traceback():
+             log_hint = ux_utils.log_path_hint(log_path, is_local=True)
+             raise RuntimeError('Failed to deploy SkyPilot on SSH targets. '
+                                f'Full log: {log_hint}'
+                                f'\nError: {stderr}')
+
+     if success:
+         # Add an empty line to separate the deployment logs from the final
+         # message
+         logger.info('')
+         if cleanup:
+             logger.info(
+                 ux_utils.finishing_message(
+                     '🎉 SSH Node Pools cleaned up successfully.',
+                     log_path=log_path,
+                     is_local=True))
+         else:
+             logger.info(
+                 ux_utils.finishing_message(
+                     '🎉 SSH Node Pools set up successfully. ',
+                     follow_up_message=(
+                         f'Run `{colorama.Style.BRIGHT}'
+                         f'sky check ssh'
+                         f'{colorama.Style.RESET_ALL}` to verify access, '
+                         f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
+                         f'{colorama.Style.RESET_ALL}` to launch a cluster. '),
+                     log_path=log_path,
+                     is_local=True))
+

  def deploy_remote_cluster(ip_list: List[str],
                            ssh_user: str,
@@ -28,7 +128,7 @@ def deploy_remote_cluster(ip_list: List[str],
                            password: Optional[str] = None):
      success = False
      path_to_package = os.path.dirname(__file__)
-     up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.sh')
+     up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
      # Get directory of script and run it from there
      cwd = os.path.dirname(os.path.abspath(up_script_path))

@@ -44,17 +144,18 @@ def deploy_remote_cluster(ip_list: List[str],
          key_file.flush()
          os.chmod(key_file.name, 0o600)

-         deploy_command = (f'{up_script_path} {ip_file.name} '
-                           f'{ssh_user} {key_file.name}')
+         # Use the legacy mode command line arguments for backward compatibility
+         deploy_command = [
+             sys.executable, up_script_path, '--ips-file', ip_file.name,
+             '--user', ssh_user, '--ssh-key', key_file.name
+         ]
+
          if context_name is not None:
-             deploy_command += f' {context_name}'
+             deploy_command.extend(['--context-name', context_name])
          if password is not None:
-             deploy_command += f' --password {password}'
+             deploy_command.extend(['--password', password])
          if cleanup:
-             deploy_command += ' --cleanup'
-
-         # Convert the command to a format suitable for subprocess
-         deploy_command = shlex.split(deploy_command)
+             deploy_command.append('--cleanup')

          # Setup logging paths
          run_timestamp = sky_logging.get_run_timestamp()
@@ -65,6 +166,11 @@ def deploy_remote_cluster(ip_list: List[str],
              msg_str = 'Cleaning up remote cluster...'
          else:
              msg_str = 'Deploying remote cluster...'
+
+         # Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
+         env = os.environ.copy()
+         env['PYTHONUNBUFFERED'] = '1'
+
          with rich_utils.safe_status(
                  ux_utils.spinner_message(msg_str,
                                           log_path=log_path,
@@ -76,7 +182,8 @@ def deploy_remote_cluster(ip_list: List[str],
                  stream_logs=False,
                  line_processor=log_utils.SkyRemoteUpLineProcessor(
                      log_path=log_path, is_local=True),
-                 cwd=cwd)
+                 cwd=cwd,
+                 env=env)
          if returncode == 0:
              success = True
          else:
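
The hunks above add a `deploy_ssh_cluster()` helper that shells out to the new `deploy_remote_cluster.py` and wires the result into a kubeconfig, while the existing `deploy_remote_cluster()` keeps its legacy argument style. The following is a minimal sketch, not part of the diff, of driving the new helper directly; the module path is inferred from the file list above, and in this release the helper is normally reached through the CLI and API-server changes rather than called by hand.

# Sketch only (assumes ~/.sky/ssh_node_pools.yaml already describes a node pool).
from sky.utils.kubernetes import kubernetes_deploy_utils

# Deploy a Kubernetes cluster onto the first pool in ~/.sky/ssh_node_pools.yaml,
# writing the resulting context into the default ~/.kube/config.
kubernetes_deploy_utils.deploy_ssh_cluster(cleanup=False)

# Tear the same deployment back down.
kubernetes_deploy_utils.deploy_ssh_cluster(cleanup=True)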
sky/utils/kubernetes/ssh-tunnel.sh
@@ -0,0 +1,387 @@
+ #!/bin/bash
+ # ssh-tunnel.sh - SSH tunnel script for Kubernetes API access
+ # Used as kubectl exec credential plugin to establish SSH tunnel on demand.
+ # Returns a valid credential format for kubectl with expiration. The expiration
+ # is calculated based on the TTL argument and is required to force kubectl to
+ # check the tunnel status frequently.
+
+ # Usage: ssh-tunnel.sh --host HOST [--user USER] [--use-ssh-config] [--ssh-key KEY] [--context CONTEXT] [--port PORT] [--ttl SECONDS]
+
+ # Default time-to-live for credential in seconds
+ # This forces kubectl to check the tunnel status frequently
+ TTL_SECONDS=30
+
+ # Parse arguments
+ USE_SSH_CONFIG=0
+ SSH_KEY=""
+ CONTEXT=""
+ HOST=""
+ USER=""
+ PORT=6443 # Default port if not specified
+
+ # Debug log to ~/.sky/ssh_node_pools_info/$CONTEXT-tunnel.log
+ debug_log() {
+   local message="$(date): $1"
+   echo "$message" >> "$LOG_FILE"
+ }
+
+ # Generate expiration timestamp for credential
+ generate_expiration_timestamp() {
+   # Try macOS date format first, fallback to Linux format
+   date -u -v+${TTL_SECONDS}S +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -d "+${TTL_SECONDS} seconds" +"%Y-%m-%dT%H:%M:%SZ"
+ }
+
+ # Acquire the lock, return 0 if successful, 1 if another process is already holding the lock
+ acquire_lock() {
+   # Check for flock command
+   if ! command -v flock >/dev/null 2>&1; then
+     debug_log "flock command not available, using alternative lock mechanism"
+     # Simple file-based locking
+     if [ -f "$LOCK_FILE" ]; then
+       lock_pid=$(cat "$LOCK_FILE" 2>/dev/null)
+       if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
+         debug_log "Another process ($lock_pid) is starting the tunnel, waiting briefly"
+         return 1
+       else
+         # Stale lock file
+         debug_log "Removing stale lock file"
+         rm -f "$LOCK_FILE"
+       fi
+     fi
+     # Create our lock
+     echo $$ > "$LOCK_FILE"
+     return 0
+   else
+     # Use flock for better locking
+     exec 9>"$LOCK_FILE"
+     if ! flock -n 9; then
+       debug_log "Another process is starting the tunnel, waiting briefly"
+       return 1
+     fi
+     return 0
+   fi
+ }
+
+ # Release the lock
+ release_lock() {
+   if command -v flock >/dev/null 2>&1; then
+     # Using flock
+     exec 9>&- # Close file descriptor to release lock
+   else
+     # Using simple lock
+     rm -f "$LOCK_FILE"
+   fi
+   debug_log "Lock released"
+ }
+
+ # Generate SSH command based on available tools and parameters
+ generate_ssh_command() {
+   # Check for autossh
+   if ! command -v autossh >/dev/null 2>&1; then
+     debug_log "WARNING: autossh is not installed but recommended for reliable SSH tunnels"
+     debug_log "Install autossh: brew install autossh (macOS), apt-get install autossh (Ubuntu/Debian)"
+
+     # Fall back to regular ssh
+     if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+       SSH_CMD=("ssh" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+     else
+       SSH_CMD=("ssh" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+       # Add SSH key if provided
+       if [[ -n "$SSH_KEY" ]]; then
+         SSH_CMD+=("-i" "$SSH_KEY")
+       fi
+
+       # Add user@host
+       SSH_CMD+=("$USER@$HOST")
+     fi
+   else
+     # Configure autossh
+     if [[ $USE_SSH_CONFIG -eq 1 ]]; then
+       SSH_CMD=("autossh" "-M" "0" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N" "$HOST")
+     else
+       SSH_CMD=("autossh" "-M" "0" "-o" "StrictHostKeyChecking=no" "-o" "IdentitiesOnly=yes" "-o" "ServerAliveInterval=30" "-o" "ServerAliveCountMax=3" "-o" "ExitOnForwardFailure=yes" "-L" "$PORT:127.0.0.1:6443" "-N")
+
+       # Add SSH key if provided
+       if [[ -n "$SSH_KEY" ]]; then
+         SSH_CMD+=("-i" "$SSH_KEY")
+       fi
+
+       # Add user@host
+       SSH_CMD+=("$USER@$HOST")
+     fi
+   fi
+ }
+
+ # Function to read certificate files if they exist
+ read_certificate_data() {
+   local client_cert_file="$TUNNEL_DIR/$CONTEXT-cert.pem"
+   local client_key_file="$TUNNEL_DIR/$CONTEXT-key.pem"
+   local cert_data=""
+   local key_data=""
+
+   if [[ -f "$client_cert_file" ]]; then
+     # Read the certificate file as is - it's already in PEM format
+     cert_data=$(cat "$client_cert_file")
+     debug_log "Found client certificate data for context $CONTEXT"
+
+     # Log the first and last few characters to verify PEM format
+     local cert_start=$(head -1 "$client_cert_file")
+     local cert_end=$(tail -1 "$client_cert_file")
+     debug_log "Certificate starts with: $cert_start"
+     debug_log "Certificate ends with: $cert_end"
+
+     # Check if it has proper PEM format
+     if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file" || ! grep -q "END CERTIFICATE" "$client_cert_file"; then
+       debug_log "WARNING: Certificate file may not be in proper PEM format"
+       # Try to fix it if needed
+       if ! grep -q "BEGIN CERTIFICATE" "$client_cert_file"; then
+         echo "-----BEGIN CERTIFICATE-----" > "$client_cert_file.fixed"
+         cat "$client_cert_file" >> "$client_cert_file.fixed"
+         echo "-----END CERTIFICATE-----" >> "$client_cert_file.fixed"
+         mv "$client_cert_file.fixed" "$client_cert_file"
+         cert_data=$(cat "$client_cert_file")
+         debug_log "Fixed certificate format by adding BEGIN/END markers"
+       fi
+     fi
+   fi
+
+   if [[ -f "$client_key_file" ]]; then
+     # Read the key file as is - it's already in PEM format
+     key_data=$(cat "$client_key_file")
+     debug_log "Found client key data for context $CONTEXT"
+
+     # Log the first and last few characters to verify PEM format
+     local key_start=$(head -1 "$client_key_file")
+     local key_end=$(tail -1 "$client_key_file")
+     debug_log "Key starts with: $key_start"
+     debug_log "Key ends with: $key_end"
+
+     # Check if it has proper PEM format
+     if ! grep -q "BEGIN" "$client_key_file" || ! grep -q "END" "$client_key_file"; then
+       debug_log "WARNING: Key file may not be in proper PEM format"
+       # Try to fix it if needed
+       if ! grep -q "BEGIN" "$client_key_file"; then
+         echo "-----BEGIN PRIVATE KEY-----" > "$client_key_file.fixed"
+         cat "$client_key_file" >> "$client_key_file.fixed"
+         echo "-----END PRIVATE KEY-----" >> "$client_key_file.fixed"
+         mv "$client_key_file.fixed" "$client_key_file"
+         key_data=$(cat "$client_key_file")
+         debug_log "Fixed key format by adding BEGIN/END markers"
+       fi
+     fi
+   fi
+
+   echo "$cert_data:$key_data"
+ }
+
+ # Function to generate credentials JSON
+ generate_credentials_json() {
+   local expiration_time=$(generate_expiration_timestamp)
+   local cert_bundle=$(read_certificate_data)
+   local client_cert_data=${cert_bundle%:*}
+   local client_key_data=${cert_bundle#*:}
+
+   if [[ -n "$client_cert_data" && -n "$client_key_data" ]]; then
+     # Debug the certificate data
+     debug_log "Certificate data length: $(echo -n "$client_cert_data" | wc -c) bytes"
+     debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
+
+     # Check if we can create proper JSON with `jq`
+     if command -v jq &>/dev/null; then
+       debug_log "Using jq for JSON formatting"
+
+       # Create a temporary file for the JSON output to avoid shell escaping issues
+       local TEMP_JSON_FILE=$(mktemp)
+
+       # Write the JSON to the temporary file using jq for proper JSON formatting
+       cat > "$TEMP_JSON_FILE" << EOL
+ {
+   "apiVersion": "client.authentication.k8s.io/v1beta1",
+   "kind": "ExecCredential",
+   "status": {
+     "clientCertificateData": $(printf '%s' "$client_cert_data" | jq -R -s .),
+     "clientKeyData": $(printf '%s' "$client_key_data" | jq -R -s .),
+     "expirationTimestamp": "$expiration_time"
+   }
+ }
+ EOL
+
+       # Read the JSON from the file
+       local json_response=$(cat "$TEMP_JSON_FILE")
+
+       # Clean up
+       rm -f "$TEMP_JSON_FILE"
+
+       # Output the JSON
+       echo "$json_response"
+     else
+       debug_log "jq is not available, using simpler formatting method"
+
+       # Alternative approach: encode with base64 and use the token field instead
+       # This works because kubectl will decode token data properly
+       local combined_data=$(echo -n "${client_cert_data}:${client_key_data}" | base64 | tr -d '\n')
+
+       echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"$combined_data\",\"expirationTimestamp\":\"$expiration_time\"}}"
+
+       debug_log "Sent certificate data as encoded token instead of direct certificate fields"
+     fi
+   else
+     # Fallback to token-based credential for tunnel-only authentication
+     echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
+   fi
+ }
+
+ while [[ $# -gt 0 ]]; do
+   case $1 in
+     --use-ssh-config)
+       USE_SSH_CONFIG=1
+       shift
+       ;;
+     --ssh-key)
+       SSH_KEY="$2"
+       shift 2
+       ;;
+     --context)
+       CONTEXT="$2"
+       shift 2
+       ;;
+     --port)
+       PORT="$2"
+       shift 2
+       ;;
+     --host)
+       HOST="$2"
+       shift 2
+       ;;
+     --user)
+       USER="$2"
+       shift 2
+       ;;
+     --ttl)
+       TTL_SECONDS="$2"
+       shift 2
+       ;;
+     *)
+       echo "Unknown parameter: $1" >&2
+       exit 1
+       ;;
+   esac
+ done
+
+ # Validate required parameters
+ if [[ -z "$HOST" ]]; then
+   echo "Error: --host parameter is required" >&2
+   exit 1
+ fi
+
+ # Setup directories
+ TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
+ mkdir -p "$TUNNEL_DIR"
+
+ # Get context name for PID file
+ if [[ -z "$CONTEXT" ]]; then
+   CONTEXT="default"
+ fi
+
+ PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
+ LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
+ LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
+
+ debug_log "Starting ssh-tunnel.sh for context $CONTEXT, host $HOST, port $PORT"
+ debug_log "SSH Config: $USE_SSH_CONFIG, User: $USER, TTL: ${TTL_SECONDS}s"
+
+ # Check if specified port is already in use (tunnel may be running)
+ if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+   debug_log "Port $PORT already in use, checking if it's our tunnel"
+
+   # Check if there's a PID file and if that process is running
+   if [[ -f "$PID_FILE" ]]; then
+     OLD_PID=$(cat "$PID_FILE")
+     if kill -0 "$OLD_PID" 2>/dev/null; then
+       debug_log "Tunnel appears to be running with PID $OLD_PID"
+     else
+       debug_log "PID file exists but process $OLD_PID is not running"
+     fi
+   else
+     debug_log "Port $PORT is in use but no PID file exists"
+   fi
+
+   # Return valid credential format for kubectl with expiration
+   generate_credentials_json
+   exit 0
+ fi
+
+ # Try to acquire the lock
+ if ! acquire_lock; then
+   # Wait briefly for the tunnel to be established
+   for i in {1..10}; do
+     if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+       debug_log "Tunnel is now active"
+
+       # Return valid credential format for kubectl with expiration
+       generate_credentials_json
+       exit 0
+     fi
+     sleep 0.2
+   done
+   debug_log "Waited for tunnel but port $PORT still not available"
+ fi
+
+ # Check if we have a PID file with running process
+ if [[ -f "$PID_FILE" ]]; then
+   OLD_PID=$(cat "$PID_FILE")
+   if kill -0 "$OLD_PID" 2>/dev/null; then
+     # Process exists but port isn't open - something's wrong, kill it
+     kill "$OLD_PID" 2>/dev/null
+     debug_log "Killed stale tunnel process $OLD_PID"
+   else
+     debug_log "PID file exists but process $OLD_PID is not running anymore"
+   fi
+   # Remove the stale PID file
+   rm -f "$PID_FILE"
+ fi
+
+ # Generate the SSH command
+ generate_ssh_command
+
+ debug_log "Starting SSH tunnel: ${SSH_CMD[*]}"
+
+ # Start the tunnel in foreground and wait for it to establish
+ "${SSH_CMD[@]}" >> "$LOG_FILE" 2>&1 &
+ TUNNEL_PID=$!
+
+ # Save PID
+ echo $TUNNEL_PID > "$PID_FILE"
+ debug_log "Tunnel started with PID $TUNNEL_PID"
+
+ # Wait for tunnel to establish
+ tunnel_up=0
+ for i in {1..20}; do
+   if nc -z 127.0.0.1 "$PORT" 2>/dev/null; then
+     debug_log "Tunnel established successfully on port $PORT"
+     tunnel_up=1
+     break
+   fi
+   sleep 0.2
+ done
+
+ # Clean up lock file
+ release_lock
+
+ # Check if the tunnel process is still running
+ if ! kill -0 $TUNNEL_PID 2>/dev/null; then
+   debug_log "ERROR: Tunnel process exited unexpectedly! Check logs for details"
+   if [[ -f "$PID_FILE" ]]; then
+     rm -f "$PID_FILE"
+   fi
+   # Return error in case of tunnel failure
+   echo "Failed to establish SSH tunnel. See $TUNNEL_DIR/$CONTEXT-tunnel.log for details." >&2
+   exit 1
+ elif [[ $tunnel_up -eq 0 ]]; then
+   debug_log "WARNING: Tunnel process is running but port $PORT is not responding"
+ fi
+
+ # Return valid credential format with certificates if available
+ generate_credentials_json
+ exit 0
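
The new script acts as a kubectl exec credential plugin: each time the credential expires (every TTL_SECONDS, 30s by default), kubectl re-invokes it, the script (re)establishes the SSH tunnel if needed, and it prints an ExecCredential object on stdout. The Python sketch below is not part of the diff; it only mirrors the payload that generate_credentials_json() emits, to make the expected shape explicit (field names follow the client.authentication.k8s.io/v1beta1 exec plugin API).

# Sketch only: the ExecCredential payload shape produced by ssh-tunnel.sh.
import datetime
import json


def exec_credential(cert_pem: str = '', key_pem: str = '',
                    ttl_seconds: int = 30) -> str:
    """Build an ExecCredential JSON string like ssh-tunnel.sh prints."""
    expiration = (datetime.datetime.now(datetime.timezone.utc) +
                  datetime.timedelta(seconds=ttl_seconds))
    status = {'expirationTimestamp': expiration.strftime('%Y-%m-%dT%H:%M:%SZ')}
    if cert_pem and key_pem:
        # With client certs present, kubectl authenticates via mTLS over the
        # locally forwarded API server port.
        status['clientCertificateData'] = cert_pem
        status['clientKeyData'] = key_pem
    else:
        # Tunnel-only fallback, matching the script's placeholder token.
        status['token'] = 'k8s-ssh-tunnel-token'
    return json.dumps({
        'apiVersion': 'client.authentication.k8s.io/v1beta1',
        'kind': 'ExecCredential',
        'status': status,
    })


print(exec_credential())

kubectl reaches a plugin like this through an exec entry under users[].user in the kubeconfig; presumably that wiring, along with tunnel teardown, is handled by the other SSH Node Pools changes in this release (deploy_remote_cluster.py and cleanup-tunnel.sh) and is not shown in this hunk.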