skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,351 @@
1
"""Shadeform instance provisioning."""
import time
from typing import Any, Dict, List, Optional, Tuple

import requests

from sky import sky_logging
from sky.provision import common
from sky.provision.shadeform import shadeform_utils
from sky.utils import status_lib

# Seconds to sleep between readiness polls in _wait_for_instances_ready().
POLL_INTERVAL = 10
# Overall budget (seconds) for instances to become ready before giving up.
INSTANCE_READY_TIMEOUT = 3600

logger = sky_logging.init_logger(__name__)

# Status mapping from Shadeform to SkyPilot
# NOTE(review): 'deleted' maps to STOPPED rather than to an absent/terminated
# state, even though stop_instances() below says stopping is unsupported —
# confirm this mapping is intended.
SHADEFORM_STATUS_MAP = {
    'creating': status_lib.ClusterStatus.INIT,
    'pending_provider': status_lib.ClusterStatus.INIT,
    'pending': status_lib.ClusterStatus.INIT,
    'active': status_lib.ClusterStatus.UP,
    'deleted': status_lib.ClusterStatus.STOPPED,
}
27
def _get_cluster_instances(cluster_name_on_cloud: str) -> Dict[str, Any]:
    """Return a mapping of instance ID -> instance dict for this cluster.

    Cluster membership is determined purely by instance name: only
    '<cluster>-head' and '<cluster>-worker' are matched. On any API or
    payload error an empty mapping is returned (best-effort).
    """
    wanted_names = (f'{cluster_name_on_cloud}-head',
                    f'{cluster_name_on_cloud}-worker')
    try:
        instance_list = shadeform_utils.get_instances().get('instances', [])
        return {
            inst['id']: inst
            for inst in instance_list
            if inst.get('name') in wanted_names
        }
    except (ValueError, KeyError, requests.exceptions.RequestException) as e:
        logger.warning(f'Failed to get instances: {e}')
        return {}
46
+
47
+
48
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
49
+ """Get the head instance ID from a list of instances."""
50
+ for instance_id, instance in instances.items():
51
+ if instance.get('name', '').endswith('-head'):
52
+ return instance_id
53
+ return None
54
+
55
+
56
def _wait_for_instances_ready(cluster_name_on_cloud: str,
                              expected_count: int,
                              timeout: int = INSTANCE_READY_TIMEOUT) -> bool:
    """Poll until `expected_count` instances are ready, or time out.

    An instance counts as ready when its status is 'active' and both an
    IP and an SSH port are reported. Returns True on success, False if
    the timeout elapses first.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        cluster = _get_cluster_instances(cluster_name_on_cloud)
        ready_count = sum(
            1 for inst in cluster.values()
            if inst.get('status') == 'active' and
            inst.get('ip') is not None and
            inst.get('ssh_port') is not None)

        logger.info(f'Waiting for instances to be ready: '
                    f'({ready_count}/{expected_count})')

        if ready_count >= expected_count:
            return True
        time.sleep(POLL_INTERVAL)

    return False
81
+
82
+
83
def _parse_instance_type(instance_type_full: str) -> Tuple[str, str]:
    """Split a catalog InstanceType into (cloud, shade_instance_type).

    The catalog encodes Shadeform offerings as '{cloud}_{instance_type}'
    (e.g. 'massedcompute_A6000_basex2'), where cloud is the underlying
    provider (massedcompute, scaleway, lambda, ...). The instance-type
    remainder is normalized into the form the Shadeform API expects:
    hyphens become underscores, a trailing 'B' is dropped, and 'GBx'
    becomes 'Gx' (case sensitive).
    """
    assert (isinstance(instance_type_full, str) and
            '_' in instance_type_full), \
        f'InstanceType must be in format cloud_instance_type, got: ' \
        f'{instance_type_full}'
    # First '_' separates the provider from the instance type; the type
    # itself may contain more underscores.
    cloud, _, instance_type = instance_type_full.partition('_')
    # Shadeform uses underscores instead of hyphens.
    instance_type = instance_type.replace('-', '_')
    if instance_type.endswith('B'):
        instance_type = instance_type[:-1]
    # Replace "GBx" with "Gx" (case sensitive).
    instance_type = instance_type.replace('GBx', 'Gx')
    assert cloud, 'Cloud provider cannot be empty'
    assert instance_type, 'Instance type cannot be empty'
    return cloud, instance_type


def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Run instances for the given cluster.

    Reuses the cluster when enough active instances already exist;
    otherwise creates the missing head/worker nodes and waits for all of
    them to become ready.

    Args:
        region: Region passed through to the Shadeform create API.
        cluster_name: Unused; cluster_name_on_cloud is used for naming.
        cluster_name_on_cloud: Basis for instance names
            ('<cluster>-head' / '<cluster>-worker').
        config: Provision config supplying node_config, count and
            authentication_config.

    Returns:
        A ProvisionRecord for the (possibly pre-existing) cluster.

    Raises:
        RuntimeError: If an existing cluster has no head node, or if
            instances do not become ready within the timeout.

    NOTE(review): every worker shares the single name
    '<cluster>-worker', and _get_cluster_instances() matches exactly the
    two names above — confirm multi-worker clusters are tracked
    correctly.
    """
    del cluster_name  # unused - we use cluster_name_on_cloud
    logger.info(f'Running instances for cluster {cluster_name_on_cloud} '
                f'in region {region}')
    logger.debug(f'DEBUG: region type={type(region)}, value={region!r}')
    logger.debug(f'DEBUG: config node_config={config.node_config}')

    # Check existing instances.
    existing_instances = _get_cluster_instances(cluster_name_on_cloud)
    head_instance_id = _get_head_instance_id(existing_instances)

    # Filter active instances.
    active_instances = {
        iid: inst
        for iid, inst in existing_instances.items()
        if inst.get('status') == 'active'
    }

    current_count = len(active_instances)
    target_count = config.count

    logger.info(f'Current instances: {current_count}, target: {target_count}')

    if current_count >= target_count:
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head node')
        logger.info(f'Cluster already has {current_count} instances, '
                    f'no need to start more')
        return common.ProvisionRecord(
            provider_name='shadeform',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=None,  # Shadeform doesn't use separate zones
            head_instance_id=head_instance_id,
            resumed_instance_ids=[],
            created_instance_ids=[])

    # The spec is identical for every node, so parse it once instead of
    # re-parsing inside the creation loop.
    node_config = config.node_config
    assert 'InstanceType' in node_config, \
        'InstanceType must be present in node_config'
    cloud, instance_type = _parse_instance_type(node_config['InstanceType'])

    # SSH key ID for authentication - optional and may be None.
    ssh_key_id = config.authentication_config.get('ssh_key_id')

    # Create new instances.
    to_create = target_count - current_count
    created_instance_ids = []

    for _ in range(to_create):
        # The first node created becomes the head node.
        node_type = 'head' if head_instance_id is None else 'worker'
        instance_name = f'{cluster_name_on_cloud}-{node_type}'

        create_config = {
            'cloud': cloud,
            'region': region,
            'shade_instance_type': instance_type,
            'name': instance_name,
            'ssh_key_id': ssh_key_id
        }

        try:
            logger.info(f'Creating {node_type} instance: {instance_name}')
            response = shadeform_utils.create_instance(create_config)
            instance_id = response['id']
            created_instance_ids.append(instance_id)

            if head_instance_id is None:
                head_instance_id = instance_id

            logger.info(f'Created instance {instance_id} ({node_type})')

        except Exception as e:  # pylint: disable=broad-except
            logger.error(f'Failed to create instance: {e}')
            # Best-effort cleanup of any instances created so far so a
            # partial launch does not leak billable nodes.
            for iid in created_instance_ids:
                try:
                    shadeform_utils.delete_instance(iid)
                except requests.exceptions.RequestException as cleanup_e:
                    logger.warning(
                        f'Failed to cleanup instance {iid}: {cleanup_e}')
            raise

    # Wait for all instances to be ready.
    logger.info('Waiting for instances to become ready...')
    if not _wait_for_instances_ready(cluster_name_on_cloud, target_count):
        raise RuntimeError('Timed out waiting for instances to be ready')

    assert head_instance_id is not None, 'head_instance_id should not be None'

    return common.ProvisionRecord(
        provider_name='shadeform',
        cluster_name=cluster_name_on_cloud,
        region=region,
        # Fixed: this path previously returned zone=region while the
        # reuse path above returned zone=None; Shadeform doesn't use
        # separate zones, so both now return None.
        zone=None,
        head_instance_id=head_instance_id,
        resumed_instance_ids=[],
        created_instance_ids=created_instance_ids)
220
+
221
+
222
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """No-op: readiness is already awaited inside run_instances().

    Shadeform instances count as ready once they reach 'active', which
    run_instances() polls for before returning.
    """
    del region, cluster_name_on_cloud, state  # unused
228
+
229
+
230
def stop_instances(cluster_name_on_cloud: str,
                   provider_config: Optional[Dict[str, Any]] = None,
                   worker_only: bool = False) -> None:
    """Always raises: Shadeform exposes no stop/suspend API."""
    del cluster_name_on_cloud, provider_config, worker_only  # unused
    raise NotImplementedError(
        'Stopping instances is not supported by Shadeform')
237
+
238
+
239
def terminate_instances(cluster_name_on_cloud: str,
                        provider_config: Optional[Dict[str, Any]] = None,
                        worker_only: bool = False) -> None:
    """Delete the cluster's instances (workers only when worker_only).

    Individual delete failures are logged and skipped so one bad node
    does not abort teardown of the rest.
    """
    del provider_config  # unused
    logger.info(f'Terminating instances for cluster {cluster_name_on_cloud}')

    instances = _get_cluster_instances(cluster_name_on_cloud)
    if not instances:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    def _is_head(inst: Dict[str, Any]) -> bool:
        return inst.get('name', '').endswith('-head')

    # When worker_only, keep the head node alive.
    targets = {
        instance_id: inst
        for instance_id, inst in instances.items()
        if not (worker_only and _is_head(inst))
    }

    for instance_id, instance in targets.items():
        try:
            logger.info(
                f'Terminating instance {instance_id} ({instance.get("name")})')
            shadeform_utils.delete_instance(instance_id)
        except requests.exceptions.RequestException as e:
            logger.warning(f'Failed to terminate instance {instance_id}: {e}')
268
+
269
+
270
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Build a ClusterInfo snapshot for the named cluster."""
    del region, provider_config  # unused
    instances = _get_cluster_instances(cluster_name_on_cloud)

    if not instances:
        return common.ClusterInfo(instances={},
                                  head_instance_id=None,
                                  provider_name='shadeform')

    head_instance_id = _get_head_instance_id(instances)

    # ClusterInfo expects Dict[InstanceId, List[InstanceInfo]].
    # NOTE(review): the same 'ip' field is used for both internal and
    # external addresses - the API response here carries no separate
    # private IP; confirm that is acceptable downstream.
    cluster_instances = {
        instance_id: [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip=inst.get('ip', ''),
                external_ip=inst.get('ip', ''),
                ssh_port=inst.get('ssh_port', 22),
                tags={},
            )
        ] for instance_id, inst in instances.items()
    }

    # Default login user unless the head node reports one.
    ssh_user = 'shadeform'
    if head_instance_id is not None:
        head = instances.get(head_instance_id, {})
        ssh_user = head.get('ssh_user', 'shadeform')

    return common.ClusterInfo(instances=cluster_instances,
                              head_instance_id=head_instance_id,
                              provider_name='shadeform',
                              ssh_user=ssh_user)
307
+
308
+
309
def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
    retry_if_missing: bool = False,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """Query the status of instances.

    Args:
        cluster_name: Unused.
        cluster_name_on_cloud: Cluster whose instances are queried.
        provider_config: Unused.
        non_terminated_only: When True, skip instances whose mapped
            status is STOPPED (Shadeform's 'deleted').
        retry_if_missing: Accepted for signature consistency with the
            other providers' query_instances implementations; currently
            unused here.

    Returns:
        Mapping of instance ID to (status, reason); reason is always
        None. Unknown Shadeform statuses map to INIT.
    """
    del cluster_name, provider_config, retry_if_missing  # unused
    instances = _get_cluster_instances(cluster_name_on_cloud)

    if not instances:
        return {}

    status_map: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                                Optional[str]]] = {}
    for instance_id, instance in instances.items():
        shadeform_status = instance.get('status', 'unknown')
        sky_status = SHADEFORM_STATUS_MAP.get(shadeform_status,
                                              status_lib.ClusterStatus.INIT)

        if (non_terminated_only and
                sky_status == status_lib.ClusterStatus.STOPPED):
            continue

        status_map[instance_id] = (sky_status, None)

    return status_map
336
+
337
+
338
def open_ports(cluster_name_on_cloud: str,
               ports: List[str],
               provider_config: Optional[Dict[str, Any]] = None) -> None:
    """Always raises: Shadeform exposes no port-management API.

    Previously raised a bare NotImplementedError(); a message is added
    for consistency with stop_instances() and clearer user errors.
    """
    del cluster_name_on_cloud, ports, provider_config  # unused
    raise NotImplementedError(
        'Opening ports is not supported by Shadeform')
344
+
345
+
346
def cleanup_ports(cluster_name_on_cloud: str,
                  ports: List[str],
                  provider_config: Optional[Dict[str, Any]] = None) -> None:
    """No-op: dynamic port opening is unsupported, so there is nothing
    to clean up."""
    del cluster_name_on_cloud, ports, provider_config  # unused
@@ -0,0 +1,83 @@
1
"""Shadeform API utilities."""

import os
from typing import Any, Dict

from sky.adaptors import common

# Lazy import to avoid dependency on external packages
requests = common.LazyImport('requests')

# Shadeform API configuration
SHADEFORM_API_BASE = 'https://api.shadeform.ai/v1'  # REST API root URL
SHADEFORM_API_KEY_PATH = '~/.shadeform/api_key'  # file read by get_api_key()
14
+
15
+
16
def get_api_key(api_key_path: str = '~/.shadeform/api_key') -> str:
    """Read the Shadeform API key from a file.

    Args:
        api_key_path: Location of the key file; '~' is expanded.
            Defaults to the standard location (keep this literal in sync
            with SHADEFORM_API_KEY_PATH). Parameterized so callers and
            tests can point at an alternate key file.

    Returns:
        The API key with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If the key file does not exist.
        ValueError: If the key file exists but is empty or blank.
    """
    api_key_path = os.path.expanduser(api_key_path)
    if not os.path.exists(api_key_path):
        raise FileNotFoundError(
            f'Shadeform API key not found at {api_key_path}. '
            'Please save your API key to this file.')

    with open(api_key_path, 'r', encoding='utf-8') as f:
        api_key = f.read().strip()

    if not api_key:
        raise ValueError(f'Shadeform API key is empty in {api_key_path}')

    return api_key
31
+
32
+
33
def make_request(method: str, endpoint: str, **kwargs) -> Any:
    """Send an authenticated request to the Shadeform API.

    Args:
        method: HTTP verb, e.g. 'GET' or 'POST'.
        endpoint: Path under the API base, with or without a leading '/'.
        **kwargs: Passed through to requests.request (json=, params=, ...).
            A default timeout of 60 seconds is applied unless the caller
            supplies one.

    Returns:
        The parsed JSON body, or {} when the response body is empty
        (e.g. delete operations return a bare 200).

    Raises:
        requests.HTTPError: On non-2xx responses (raise_for_status).
    """
    url = f'{SHADEFORM_API_BASE}/{endpoint.lstrip("/")}'
    headers = {
        'X-API-KEY': get_api_key(),
        'Content-Type': 'application/json',
    }
    # Without an explicit timeout, requests can block indefinitely on a
    # hung server; default it but let callers override.
    kwargs.setdefault('timeout', 60)

    response = requests.request(method, url, headers=headers, **kwargs)
    response.raise_for_status()

    # Some APIs (like delete) return empty responses with just 200 status.
    if response.text.strip():
        return response.json()
    # Return empty dict for empty responses (e.g. delete operations).
    return {}
50
+
51
+
52
def get_instances() -> Dict[str, Any]:
    """List every instance visible to the configured API key."""
    return make_request('GET', '/instances')
55
+
56
+
57
def get_instance_info(instance_id: str) -> Dict[str, Any]:
    """Fetch detailed information for a single instance."""
    endpoint = f'/instances/{instance_id}/info'
    return make_request('GET', endpoint)
60
+
61
+
62
def create_instance(config: Dict[str, Any]) -> Dict[str, Any]:
    """Launch a new instance from the given creation payload."""
    return make_request('POST', '/instances/create', json=config)
65
+
66
+
67
def delete_instance(instance_id: str) -> Dict[str, Any]:
    """Delete an instance by ID.

    Note: Shadeform's delete API returns an empty body with a 200
    status, so the result is typically an empty dict.
    """
    endpoint = f'/instances/{instance_id}/delete'
    return make_request('POST', endpoint)
73
+
74
+
75
def get_ssh_keys() -> Dict[str, Any]:
    """List all registered SSH keys."""
    return make_request('GET', '/sshkeys')
78
+
79
+
80
def add_ssh_key(name: str, public_key: str) -> Dict[str, Any]:
    """Register a new SSH public key under the given name."""
    payload = {'name': name, 'public_key': public_key}
    return make_request('POST', '/sshkeys/add', json=payload)
@@ -221,9 +221,10 @@ def query_instances(
221
221
  cluster_name_on_cloud: str,
222
222
  provider_config: Optional[Dict[str, Any]] = None,
223
223
  non_terminated_only: bool = True,
224
+ retry_if_missing: bool = False,
224
225
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
225
226
  """See sky/provision/__init__.py"""
226
- del cluster_name # unused
227
+ del cluster_name, retry_if_missing # unused
227
228
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
228
229
  instances = _filter_instances(cluster_name_on_cloud, None)
229
230
  # "running", "frozen", "stopped", "unknown", "loading"
@@ -398,9 +398,10 @@ def query_instances(
398
398
  cluster_name_on_cloud: str,
399
399
  provider_config: Optional[Dict[str, Any]] = None,
400
400
  non_terminated_only: bool = True,
401
+ retry_if_missing: bool = False,
401
402
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
402
403
  """See sky/provision/__init__.py"""
403
- del cluster_name # unused
404
+ del cluster_name, retry_if_missing # unused
404
405
  logger.info('New provision of Vsphere: query_instances().')
405
406
  assert provider_config is not None, cluster_name_on_cloud
406
407
  region = provider_config['region']
sky/resources.py CHANGED
@@ -1104,7 +1104,7 @@ class Resources:
1104
1104
  regions = self.cloud.regions_with_offering(self._instance_type,
1105
1105
  self.accelerators,
1106
1106
  self._use_spot, self._region,
1107
- self._zone)
1107
+ self._zone, self)
1108
1108
  if self._image_id is not None and None not in self._image_id:
1109
1109
  regions = [r for r in regions if r.name in self._image_id]
1110
1110
 
@@ -77,8 +77,13 @@ class APIHealthResponse(ResponseBaseModel):
77
77
  version: str = ''
78
78
  version_on_disk: str = ''
79
79
  commit: str = ''
80
+ # Whether basic auth on api server is enabled
80
81
  basic_auth_enabled: bool = False
81
82
  user: Optional[models.User] = None
83
+ # Whether service account token is enabled
84
+ service_account_token_enabled: bool = False
85
+ # Whether basic auth on ingress is enabled
86
+ ingress_basic_auth_enabled: bool = False
82
87
 
83
88
 
84
89
  class StatusResponse(ResponseBaseModel):
@@ -90,7 +95,7 @@ class StatusResponse(ResponseBaseModel):
90
95
  # This is an internally facing field anyway, so it's less
91
96
  # of a problem that it's not typed.
92
97
  handle: Optional[Any] = None
93
- last_use: str
98
+ last_use: Optional[str] = None
94
99
  status: status_lib.ClusterStatus
95
100
  autostop: int
96
101
  to_down: bool
@@ -98,11 +103,8 @@ class StatusResponse(ResponseBaseModel):
98
103
  # metadata is a JSON, so we use Any here.
99
104
  metadata: Optional[Dict[str, Any]] = None
100
105
  cluster_hash: str
101
- # pydantic cannot generate the pydantic-core schema for
102
- # storage_mounts_metadata, so we use Any here.
103
- storage_mounts_metadata: Optional[Dict[str, Any]] = None
104
106
  cluster_ever_up: bool
105
- status_updated_at: int
107
+ status_updated_at: Optional[int] = None
106
108
  user_hash: str
107
109
  user_name: str
108
110
  config_hash: Optional[str] = None
@@ -160,6 +162,8 @@ class StorageRecord(ResponseBaseModel):
160
162
  # and therefore can be non-optional.
161
163
  class ManagedJobRecord(ResponseBaseModel):
162
164
  """A single managed job record."""
165
+ # The job_id in the spot table
166
+ task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
163
167
  job_id: Optional[int] = None
164
168
  task_id: Optional[int] = None
165
169
  job_name: Optional[str] = None
@@ -0,0 +1,30 @@
1
+ """Initial schema for sky config database
2
+
3
+ Revision ID: 001
4
+ Revises:
5
+ Create Date: 2025-10-21
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from alembic import op
10
+
11
+ from sky.skypilot_config import Base
12
+ from sky.utils.db import db_utils
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision = '001'
16
+ down_revision = None
17
+ branch_labels = None
18
+ depends_on = None
19
+
20
+
21
def upgrade():
    """Create initial schema for config_yaml table"""
    migration_context = op.get_context()
    # NOTE(review): autocommit_block is presumably required for DDL on
    # some backends - confirm against the other sky migrations.
    with migration_context.autocommit_block():
        # Create every table registered on the config DB metadata.
        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
26
+
27
+
28
def downgrade():
    """Drop all tables"""
    # Destroys every table in the config schema.
    connection = op.get_bind()
    Base.metadata.drop_all(bind=connection)
@@ -0,0 +1,42 @@
1
+ """Add columns for stored DAG/env file contents.
2
+
3
+ Revision ID: 004
4
+ Revises: 003
5
+ Create Date: 2025-10-27
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+ import sqlalchemy as sa
13
+
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '004'
18
+ down_revision: Union[str, Sequence[str], None] = '003'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
def upgrade():
    """Add columns to persist job file contents in the database."""
    # All three columns share the same type and default; add them in a loop.
    new_columns = (
        'dag_yaml_content',
        'original_user_yaml_content',
        'env_file_content',
    )
    with op.get_context().autocommit_block():
        for column_name in new_columns:
            db_utils.add_column_to_table_alembic('job_info',
                                                 column_name,
                                                 sa.Text(),
                                                 server_default=None)
38
+
39
+
40
def downgrade():
    """No downgrade logic."""
    # Intentionally empty: the columns added in upgrade() are kept.
@@ -0,0 +1,38 @@
1
+ """Adding columns for the GC time of task logs and controller logs.
2
+
3
+ Revision ID: 005
4
+ Revises: 004
5
+ Create Date: 2025-10-20
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+ import sqlalchemy as sa
13
+
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '005'
18
+ down_revision: Union[str, Sequence[str], None] = '004'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
def upgrade():
    """Add columns for logs gc."""
    # (table, column) pairs that record when logs were garbage-collected.
    gc_columns = (
        ('job_info', 'controller_logs_cleaned_at'),
        ('spot', 'logs_cleaned_at'),
    )
    with op.get_context().autocommit_block():
        for table_name, column_name in gc_columns:
            db_utils.add_column_to_table_alembic(table_name,
                                                 column_name,
                                                 sa.Float(),
                                                 server_default=None)
34
+
35
+
36
def downgrade():
    """No-op downgrade: the logs-gc columns are left in place."""
    # The previous docstring said "Remove columns for logs gc", but the
    # body never dropped them; documenting the actual (no-op) behavior.
    pass