skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,139 @@
1
+ """Utility functions for generating instance links for cloud providers."""
2
+ from typing import Dict
3
+
4
+ from sky import sky_logging
5
+ from sky.provision import common
6
+ from sky.provision import constants as provision_constants
7
+
8
+ logger = sky_logging.init_logger(__name__)
9
+
10
# URL templates for each cloud provider.
# The templates containing placeholders are filled in with str.format().
# Placeholders:
#   {region} - Cloud region
#   {project_id} - GCP project ID
#   {subscription_id} - Azure subscription ID
#   {resource_group} - Azure resource group
#   {tag_key} - Tag key used to identify cluster instances
#   {cluster_name} - Name of the cluster

# EC2 console instances page for a region; the URL fragment carries a
# `tag:<key>=<value>` filter built from {tag_key} and {cluster_name}.
AWS_INSTANCES_URL = ('https://{region}.console.aws.amazon.com/ec2/v2/home'
                     '?region={region}#Instances:tag:{tag_key}={cluster_name}')

# Azure doesn't support direct tag filter URLs, so we link to the resource
# group overview page instead.
AZURE_RESOURCE_GROUP_URL = (
    'https://portal.azure.com/#@/resource/subscriptions'
    '/{subscription_id}/resourceGroups/{resource_group}/overview')

# GCP Console base URL. This one has no placeholders: the `project` and
# `pageState` query parameters are appended by the code that builds the link.
GCP_INSTANCES_BASE_URL = 'https://console.cloud.google.com/compute/instances'
29
+
30
+
31
def _build_gcp_instances_url(project_id: str, tag_key: str,
                             cluster_name: str) -> str:
    """Return a GCP Console instances URL pre-filtered by a label.

    The console keeps its table filter in a ``pageState`` query parameter.
    Decoded, the filter is a one-element JSON array:
        [{"k":"","t":10,"v":"\"label_key:label_value\"","s":true}]

    Field meanings:
        - k: filter key (empty for label filters)
        - t: filter type (10 = label filter)
        - v: filter value, i.e. "label_key:label_value" wrapped in escaped
          quotes
        - s: unknown, always true

    Three encodings are mixed in the final URL:
        - standard percent-encoding for the outer pageState structure
          (%22 for ")
        - underscore notation inside the filter object (_22 for ", _3A for
          :, _2C for a comma, _5C for a backslash)
        - double percent-encoding for the array/object delimiters
          (%255B -> %5B -> [, %257B -> %7B -> {, and likewise for the
          closing %257D / %255D)

    Args:
        project_id: GCP project to open in the console.
        tag_key: Label key to filter on.
        cluster_name: Label value to filter on.

    Returns:
        The full console URL with the label filter applied.
    """
    # \"tag_key:cluster_name\" in underscore notation (_5C_22 is \").
    label_expr = f'_5C_22{tag_key}_3A{cluster_name}_5C_22'

    # The four key/value pairs of the filter object, each already encoded;
    # they are joined below with the encoded comma (_2C).
    encoded_fields = [
        '_22k_22_3A_22_22',  # "k":""
        '_22t_22_3A10',  # "t":10
        f'_22v_22_3A_22{label_expr}_22',  # "v":"<value>"
        '_22s_22_3Atrue',  # "s":true
    ]
    encoded_object = '_2C'.join(encoded_fields)

    # Surround with the double-encoded "[{" ... "}]" delimiters.
    encoded_filter = f'%255B%257B{encoded_object}%257D%255D'

    # pageState: ("instances":("p":0,"f":"<filter>")), with %22 standing
    # in for each double quote.
    page_state = (
        f'(%22instances%22:(%22p%22:0,%22f%22:%22{encoded_filter}%22))')

    query = f'?project={project_id}&pageState={page_state}'
    return f'{GCP_INSTANCES_BASE_URL}{query}'
73
+
74
+
75
def generate_instance_links(
    cluster_info: common.ClusterInfo,
    cluster_name: str,
) -> Dict[str, str]:
    """Build cloud-console links for the instances of a cluster.

    Each link opens a console view scoped to the cluster (a tag/label
    filter on AWS and GCP, the resource group on Azure), which is handy
    for multi-node jobs.

    Args:
        cluster_info: ClusterInfo whose provider name and provider config
            determine which link, if any, is produced.
        cluster_name: Cluster name used as the tag/label value to filter on.

    Returns:
        A dict mapping a human-readable label to a URL. The dict is empty
        for Kubernetes, local, and any other provider not handled here, and
        also when the provider config lacks the fields a link needs.
    """
    provider = cluster_info.provider_name.lower()
    config = cluster_info.provider_config or {}

    # Kubernetes and local clusters have no cloud console to link to.
    if provider == 'kubernetes' or provider == 'local':
        return {}

    # Tag used by SkyPilot to identify cluster instances.
    tag_key = provision_constants.TAG_RAY_CLUSTER_NAME

    if provider == 'aws':
        region = config.get('region')
        if region:
            url = AWS_INSTANCES_URL.format(region=region,
                                           tag_key=tag_key,
                                           cluster_name=cluster_name)
            return {'AWS Instances': url}
        logger.debug('AWS region not found in provider config, '
                     'skipping instance links')
        return {}

    if provider == 'gcp':
        project_id = config.get('project_id')
        if project_id:
            url = _build_gcp_instances_url(project_id=project_id,
                                           tag_key=tag_key,
                                           cluster_name=cluster_name)
            return {'GCP Instances': url}
        logger.debug('GCP project_id not found in provider config, '
                     'skipping instance links')
        return {}

    if provider == 'azure':
        subscription_id = config.get('subscription_id')
        resource_group = config.get('resource_group')
        if subscription_id and resource_group:
            url = AZURE_RESOURCE_GROUP_URL.format(
                subscription_id=subscription_id,
                resource_group=resource_group)
            return {'Azure Resource Group': url}
        logger.debug('Azure subscription_id or resource_group not found '
                     'in provider config, skipping instance links')
        return {}

    # Any other provider: no link available.
    return {}
@@ -0,0 +1,49 @@
1
+ """Utilities for server-side interactive SSH functionality."""
2
+ import array
3
+ import socket
4
+
5
+
6
def get_pty_socket_path(session_id: str) -> str:
    """Return the Unix socket path used to pass a session's PTY descriptor.

    Args:
        session_id: Identifier embedded in the socket file name.

    Returns:
        A path of the form ``/tmp/sky_pty_<session_id>.sock``.
    """
    socket_name = 'sky_pty_{}.sock'.format(session_id)
    return '/tmp/' + socket_name
9
+
10
+
11
def send_fd(sock: socket.socket, fd: int) -> None:
    """Pass an open file descriptor to the peer of a Unix socket.

    The descriptor travels as SCM_RIGHTS ancillary data, the mechanism for
    sending or receiving a set of open file descriptors from another
    process.

    See:
        https://man7.org/linux/man-pages/man7/unix.7.html
        https://man7.org/linux/man-pages/man3/cmsg.3.html

    Args:
        sock: Connected Unix socket.
        fd: File descriptor to send.
    """
    # The descriptor is packed as a single C int in the control message.
    packed_fd = array.array('i', [fd])
    rights_message = (socket.SOL_SOCKET, socket.SCM_RIGHTS, packed_fd)
    # One dummy payload byte accompanies the control message.
    sock.sendmsg([b'x'], [rights_message])
28
+
29
+
30
def recv_fd(sock: socket.socket) -> int:
    """Receive file descriptor via Unix socket using SCM_RIGHTS.

    Only a control message of level SOL_SOCKET and type SCM_RIGHTS whose
    payload holds at least one whole C int is accepted. Any other
    ancillary data is ignored, so an unexpected or truncated control
    message results in RuntimeError rather than a bogus descriptor number
    or a decoding error.

    Args:
        sock: Connected Unix socket.

    Returns:
        Received file descriptor.

    Raises:
        RuntimeError: If no file descriptor was received.
    """
    fd_size = array.array('i').itemsize
    # NOTE: recvmsg() has no async equivalent
    _, ancdata, _, _ = sock.recvmsg(1, socket.CMSG_SPACE(fd_size))
    for cmsg_level, cmsg_type, cmsg_data in ancdata:
        if (cmsg_level != socket.SOL_SOCKET or
                cmsg_type != socket.SCM_RIGHTS):
            # Not a descriptor-passing message; keep looking.
            continue
        if len(cmsg_data) < fd_size:
            # Payload too short to hold a descriptor: treat as not received.
            continue
        # Decode only the first int; slicing also guards against a payload
        # whose length is not a multiple of the int size.
        return array.array('i', cmsg_data[:fd_size])[0]
    raise RuntimeError('No file descriptor received - '
                       'sender may have closed connection')
@@ -12,20 +12,20 @@
12
12
  # * Specify SKYPILOT_NAMESPACE env var to override the default namespace where the service account is created.
13
13
  # * Specify SKYPILOT_SA_NAME env var to override the default service account name.
14
14
  # * Specify SKIP_SA_CREATION=1 to skip creating the service account and use an existing one
15
- # * Specify SUPER_USER=1 to create a service account with cluster-admin permissions
15
+ # * Specify SUPER_USER=0 to create a service account with minimal permissions
16
16
  #
17
17
  # Usage:
18
- # # Create "sky-sa" service account with minimal permissions in "default" namespace and generate kubeconfig
18
+ # # Create "sky-sa" service account in "default" namespace and generate kubeconfig
19
19
  # $ ./generate_kubeconfig.sh
20
20
  #
21
- # # Create "my-sa" service account with minimal permissions in "my-namespace" namespace and generate kubeconfig
21
+ # # Create "my-sa" service account in "my-namespace" namespace and generate kubeconfig
22
22
  # $ SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
23
23
  #
24
24
  # # Use an existing service account "my-sa" in "my-namespace" namespace and generate kubeconfig
25
25
  # $ SKIP_SA_CREATION=1 SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
26
26
  #
27
- # # Create "sky-sa" service account with cluster-admin permissions in "default" namespace
28
- # $ SUPER_USER=1 ./generate_kubeconfig.sh
27
+ # # Create "sky-sa" service account with minimal permissions in "default" namespace (manual setup may be required)
28
+ # $ SUPER_USER=0 ./generate_kubeconfig.sh
29
29
 
30
30
  set -eu -o pipefail
31
31
 
@@ -33,11 +33,18 @@ set -eu -o pipefail
33
33
  # use default.
34
34
  SKYPILOT_SA=${SKYPILOT_SA_NAME:-sky-sa}
35
35
  NAMESPACE=${SKYPILOT_NAMESPACE:-default}
36
- SUPER_USER=${SUPER_USER:-0}
36
+ SUPER_USER=${SUPER_USER:-1}
37
37
 
38
- echo "Service account: ${SKYPILOT_SA}"
39
- echo "Namespace: ${NAMESPACE}"
40
- echo "Super user permissions: ${SUPER_USER}"
38
+ echo "=========================================="
39
+ echo "SkyPilot Kubeconfig Generation"
40
+ echo "=========================================="
41
+ echo "Service Account: ${SKYPILOT_SA}"
42
+ echo "Namespace: ${NAMESPACE}"
43
+ if [ "${SUPER_USER}" != "1" ]; then
44
+ echo "Permissions: Minimal (manual setup may be required)"
45
+ SUPER_USER=0
46
+ fi
47
+ echo ""
41
48
 
42
49
  # Set OS specific values.
43
50
  if [[ "$OSTYPE" == "linux-gnu" ]]; then
@@ -53,7 +60,7 @@ fi
53
60
 
54
61
  # If the user has set SKIP_SA_CREATION=1, skip creating the service account.
55
62
  if [ -z ${SKIP_SA_CREATION+x} ]; then
56
- echo "Creating the Kubernetes Service Account with ${SUPER_USER:+super user}${SUPER_USER:-minimal} RBAC permissions."
63
+ echo "[1/3] Creating Kubernetes Service Account and RBAC permissions..."
57
64
  if [ "${SUPER_USER}" = "1" ]; then
58
65
  # Create service account with cluster-admin permissions
59
66
  kubectl apply -f - <<EOF
@@ -219,7 +226,8 @@ roleRef:
219
226
  EOF
220
227
  fi
221
228
  # Apply optional ingress-related roles, but don't make the script fail if it fails
222
- kubectl apply -f - <<EOF || echo "Failed to apply optional ingress-related roles. Nginx ingress is likely not installed. This is not critical and the script will continue."
229
+ echo " Applying optional ingress permissions (skipped if ingress-nginx not installed)..."
230
+ kubectl apply -f - 2>/dev/null <<EOF || true
223
231
  # Optional: Role for accessing ingress resources
224
232
  apiVersion: rbac.authorization.k8s.io/v1
225
233
  kind: Role
@@ -253,8 +261,13 @@ roleRef:
253
261
  name: ${SKYPILOT_SA}-role-ingress-nginx # Use the same name as the role at line 119
254
262
  apiGroup: rbac.authorization.k8s.io
255
263
  EOF
264
+ else
265
+ echo "[1/3] Skipping service account creation (using existing account)..."
256
266
  fi
257
267
 
268
+ echo ""
269
+ echo "[2/3] Creating service account token..."
270
+
258
271
  # Checks if secret entry was defined for Service account. If defined it means that Kubernetes server has a
259
272
  # version below 1.24, otherwise one must manually create the secret and bind it to the Service account to have a non expiring token.
260
273
  # After Kubernetes v1.24 Service accounts no longer generate automatic tokens/secrets.
@@ -293,7 +306,9 @@ CURRENT_CONTEXT=$(kubectl config current-context)
293
306
  CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"${CURRENT_CONTEXT}\"})].context.cluster}")
294
307
  CURRENT_CLUSTER_ADDR=$(kubectl config view -o jsonpath="{.clusters[?(@.name == \"${CURRENT_CLUSTER}\"})].cluster.server}")
295
308
 
296
- echo "Writing kubeconfig."
309
+ echo ""
310
+ echo "[3/3] Generating kubeconfig file..."
311
+
297
312
  cat > kubeconfig <<EOF
298
313
  apiVersion: v1
299
314
  clusters:
@@ -316,24 +331,18 @@ users:
316
331
  token: ${SA_TOKEN}
317
332
  EOF
318
333
 
319
- echo "---
320
- Done!
321
-
322
- Kubeconfig using service account '${SKYPILOT_SA}' in namespace '${NAMESPACE}' written at $(pwd)/kubeconfig
323
-
324
- Copy the generated kubeconfig file to your ~/.kube/ directory to use it with
325
- kubectl and skypilot:
326
-
327
- # Backup your existing kubeconfig file
328
- mv ~/.kube/config ~/.kube/config.bak
329
- cp kubeconfig ~/.kube/config
330
-
331
- # Verify that you can access the cluster
332
- kubectl get pods
333
-
334
- Also add this to your ~/.sky/config.yaml to use the new service account:
335
-
336
- # ~/.sky/config.yaml
337
- kubernetes:
338
- remote_identity: ${SKYPILOT_SA}
339
- "
334
+ echo ""
335
+ echo "=========================================="
336
+ echo "✓ SUCCESS!"
337
+ echo "=========================================="
338
+ echo ""
339
+ echo "Kubeconfig file created successfully!"
340
+ echo ""
341
+ echo " Service Account: ${SKYPILOT_SA}"
342
+ echo " Namespace: ${NAMESPACE}"
343
+ echo " Location: $(pwd)/kubeconfig"
344
+ echo ""
345
+ echo "Next steps:"
346
+ echo " Refer to this page for setting up the credential for remote API server:"
347
+ echo " https://docs.skypilot.co/en/latest/reference/api-server/api-server-admin-deploy.html#optional-configure-cloud-accounts"
348
+ echo ""
@@ -1,13 +1,11 @@
1
- """Utility functions for deploying Kubernetes clusters."""
1
+ """Utility functions for deploying local Kubernetes kind clusters."""
2
2
  import os
3
3
  import random
4
4
  import shlex
5
5
  import subprocess
6
6
  import tempfile
7
7
  import textwrap
8
- from typing import List, Optional, Tuple
9
-
10
- import colorama
8
+ from typing import Optional, Tuple
11
9
 
12
10
  from sky import check as sky_check
13
11
  from sky import sky_logging
@@ -20,7 +18,6 @@ from sky.utils import log_utils
20
18
  from sky.utils import rich_utils
21
19
  from sky.utils import subprocess_utils
22
20
  from sky.utils import ux_utils
23
- from sky.utils.kubernetes import deploy_ssh_node_pools
24
21
 
25
22
  logger = sky_logging.init_logger(__name__)
26
23
 
@@ -32,95 +29,6 @@ LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
32
29
  LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
33
30
 
34
31
 
35
- def check_ssh_cluster_dependencies(
36
- raise_error: bool = True) -> Optional[List[str]]:
37
- """Checks if the dependencies for ssh cluster are installed.
38
-
39
- Args:
40
- raise_error: set to true when the dependency needs to be present.
41
- set to false for `sky check`, where reason strings are compiled
42
- at the end.
43
-
44
- Returns: the reasons list if there are missing dependencies.
45
- """
46
- # error message
47
- jq_message = ('`jq` is required to setup ssh cluster.')
48
-
49
- # save
50
- reasons = []
51
- required_binaries = []
52
-
53
- # Ensure jq is installed
54
- try:
55
- subprocess.run(['jq', '--version'],
56
- stdout=subprocess.DEVNULL,
57
- stderr=subprocess.DEVNULL,
58
- check=True)
59
- except (FileNotFoundError, subprocess.CalledProcessError):
60
- required_binaries.append('jq')
61
- reasons.append(jq_message)
62
-
63
- if required_binaries:
64
- reasons.extend([
65
- 'On Debian/Ubuntu, install the missing dependenc(ies) with:',
66
- f' $ sudo apt install {" ".join(required_binaries)}',
67
- 'On MacOS, install with: ',
68
- f' $ brew install {" ".join(required_binaries)}',
69
- ])
70
- if raise_error:
71
- with ux_utils.print_exception_no_traceback():
72
- raise RuntimeError('\n'.join(reasons))
73
- return reasons
74
- return None
75
-
76
-
77
- def deploy_ssh_cluster(cleanup: bool = False,
78
- infra: Optional[str] = None,
79
- kubeconfig_path: Optional[str] = None):
80
- """Deploy a Kubernetes cluster on SSH targets.
81
-
82
- This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
83
- Kubernetes cluster on the specified machines.
84
-
85
- Args:
86
- cleanup: Whether to clean up the cluster instead of deploying.
87
- infra: Name of the cluster in ssh_node_pools.yaml to use.
88
- If None, the first cluster in the file will be used.
89
- kubeconfig_path: Path to save the Kubernetes configuration file.
90
- If None, the default ~/.kube/config will be used.
91
- """
92
- check_ssh_cluster_dependencies()
93
-
94
- action = 'Cleanup' if cleanup else 'Deployment'
95
- msg_str = f'Initializing SSH Node Pools {action}...'
96
-
97
- with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
98
- try:
99
- deploy_ssh_node_pools.deploy_clusters(
100
- infra=infra, cleanup=cleanup, kubeconfig_path=kubeconfig_path)
101
- except Exception as e: # pylint: disable=broad-except
102
- logger.error(str(e))
103
- with ux_utils.print_exception_no_traceback():
104
- raise RuntimeError(
105
- 'Failed to deploy SkyPilot on some Node Pools.') from e
106
-
107
- logger.info('')
108
- if cleanup:
109
- logger.info(
110
- ux_utils.finishing_message(
111
- '🎉 SSH Node Pools cleaned up successfully.'))
112
- else:
113
- logger.info(
114
- ux_utils.finishing_message(
115
- '🎉 SSH Node Pools set up successfully. ',
116
- follow_up_message=(
117
- f'Run `{colorama.Style.BRIGHT}'
118
- f'sky check ssh'
119
- f'{colorama.Style.RESET_ALL}` to verify access, '
120
- f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
121
- f'{colorama.Style.RESET_ALL}` to launch a cluster.')))
122
-
123
-
124
32
  def generate_kind_config(port_start: int,
125
33
  num_nodes: int = 1,
126
34
  gpus: bool = False) -> str:
@@ -60,4 +60,8 @@ fi
60
60
  # We wrap the command in a bash script that waits for rsync, then execs the original command.
61
61
  # Timeout after MAX_WAIT_TIME_SECONDS seconds.
62
62
  MAX_WAIT_TIME_SECONDS=300
63
- eval "${kubectl_cmd_base% --} -i -- bash -c 'count=0; max_count=$MAX_WAIT_TIME_SECONDS*2; until which rsync >/dev/null 2>&1; do if [ \$count -ge \$max_count ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""
63
+ MAX_WAIT_COUNT=$((MAX_WAIT_TIME_SECONDS * 2))
64
+ # Use --norc --noprofile to prevent bash from sourcing startup files that might
65
+ # output to stdout and corrupt the rsync protocol. All debug output must go to
66
+ # stderr (>&2) to keep stdout clean for rsync communication.
67
+ eval "${kubectl_cmd_base% --} -i -- bash --norc --noprofile -c 'count=0; until which rsync >/dev/null 2>&1; do if [ \$count -ge $MAX_WAIT_COUNT ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""