skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (179)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend_utils.py +74 -7
  4. sky/backends/cloud_vm_ray_backend.py +169 -29
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +62 -85
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +69 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +15 -5
  14. sky/clouds/nebius.py +3 -1
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  23. sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  25. sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
  27. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  29. sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  34. sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
  36. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  37. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
  39. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
  42. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
  54. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  55. sky/dashboard/out/clusters/[cluster].html +1 -1
  56. sky/dashboard/out/clusters.html +1 -1
  57. sky/dashboard/out/config.html +1 -1
  58. sky/dashboard/out/index.html +1 -1
  59. sky/dashboard/out/infra/[context].html +1 -1
  60. sky/dashboard/out/infra.html +1 -1
  61. sky/dashboard/out/jobs/[job].html +1 -1
  62. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -1
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage.py +11 -1
  70. sky/exceptions.py +5 -0
  71. sky/execution.py +13 -10
  72. sky/global_user_state.py +191 -8
  73. sky/jobs/constants.py +1 -1
  74. sky/jobs/controller.py +0 -1
  75. sky/jobs/recovery_strategy.py +3 -3
  76. sky/jobs/scheduler.py +35 -87
  77. sky/jobs/server/core.py +82 -22
  78. sky/jobs/server/utils.py +1 -1
  79. sky/jobs/state.py +7 -5
  80. sky/jobs/utils.py +167 -8
  81. sky/provision/__init__.py +1 -0
  82. sky/provision/aws/config.py +25 -0
  83. sky/provision/aws/instance.py +37 -13
  84. sky/provision/azure/instance.py +2 -0
  85. sky/provision/cudo/cudo_wrapper.py +1 -1
  86. sky/provision/cudo/instance.py +2 -0
  87. sky/provision/do/instance.py +2 -0
  88. sky/provision/fluidstack/instance.py +2 -0
  89. sky/provision/gcp/instance.py +2 -0
  90. sky/provision/hyperbolic/instance.py +2 -1
  91. sky/provision/kubernetes/instance.py +133 -0
  92. sky/provision/lambda_cloud/instance.py +2 -0
  93. sky/provision/nebius/instance.py +2 -0
  94. sky/provision/nebius/utils.py +101 -86
  95. sky/provision/oci/instance.py +2 -0
  96. sky/provision/paperspace/instance.py +2 -1
  97. sky/provision/paperspace/utils.py +1 -1
  98. sky/provision/provisioner.py +13 -8
  99. sky/provision/runpod/instance.py +2 -0
  100. sky/provision/runpod/utils.py +1 -1
  101. sky/provision/scp/instance.py +2 -0
  102. sky/provision/vast/instance.py +2 -0
  103. sky/provision/vsphere/instance.py +2 -0
  104. sky/resources.py +6 -7
  105. sky/schemas/__init__.py +0 -0
  106. sky/schemas/api/__init__.py +0 -0
  107. sky/schemas/api/responses.py +70 -0
  108. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  109. sky/schemas/generated/__init__.py +0 -0
  110. sky/schemas/generated/autostopv1_pb2.py +36 -0
  111. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  112. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  113. sky/serve/constants.py +3 -7
  114. sky/serve/replica_managers.py +138 -117
  115. sky/serve/serve_state.py +42 -0
  116. sky/serve/serve_utils.py +58 -36
  117. sky/serve/server/impl.py +15 -19
  118. sky/serve/service.py +82 -33
  119. sky/server/constants.py +1 -1
  120. sky/server/requests/payloads.py +6 -0
  121. sky/server/requests/serializers/decoders.py +12 -2
  122. sky/server/requests/serializers/encoders.py +10 -2
  123. sky/server/server.py +64 -16
  124. sky/setup_files/dependencies.py +11 -10
  125. sky/skylet/autostop_lib.py +38 -5
  126. sky/skylet/constants.py +3 -1
  127. sky/skylet/services.py +44 -0
  128. sky/skylet/skylet.py +49 -4
  129. sky/task.py +19 -16
  130. sky/templates/aws-ray.yml.j2 +2 -2
  131. sky/templates/jobs-controller.yaml.j2 +6 -0
  132. sky/templates/kubernetes-ray.yml.j2 +1 -0
  133. sky/utils/command_runner.py +1 -1
  134. sky/utils/common_utils.py +20 -0
  135. sky/utils/config_utils.py +29 -5
  136. sky/utils/controller_utils.py +86 -0
  137. sky/utils/db/db_utils.py +17 -0
  138. sky/utils/db/migration_utils.py +1 -1
  139. sky/utils/log_utils.py +14 -5
  140. sky/utils/resources_utils.py +25 -1
  141. sky/utils/schemas.py +6 -0
  142. sky/utils/ux_utils.py +36 -5
  143. sky/volumes/server/core.py +2 -2
  144. sky/volumes/server/server.py +2 -2
  145. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
  146. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
  147. sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  149. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  150. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  151. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  155. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  156. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  158. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  160. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  161. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  163. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  164. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  166. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  169. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  170. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
  175. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
  176. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
  177. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
  178. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
  179. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
@@ -36,8 +36,10 @@ def retry(func):
 
 def get_project_by_region(region: str) -> str:
     service = nebius.iam().ProjectServiceClient(nebius.sdk())
-    projects = service.list(nebius.iam().ListProjectsRequest(
-        parent_id=nebius.get_tenant_id())).wait()
+    projects = nebius.sync_call(
+        service.list(
+            nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
+            timeout=nebius.READ_TIMEOUT))
 
     # Check is there project if in config
     project_id = skypilot_config.get_effective_region_config(
@@ -56,19 +58,21 @@ def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
     """
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
-            parent_id=project_id,
-            name=name,
-        )).wait()
-        cluster_id = cluster.metadata.id
-    except nebius.request_error():
-        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
-            metadata=nebius.nebius_common().ResourceMetadata(
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
                 parent_id=project_id,
                 name=name,
-            ),
-            spec=nebius.compute().GpuClusterSpec(
-                infiniband_fabric=fabric))).wait()
+            )))
+        cluster_id = cluster.metadata.id
+    except nebius.request_error():
+        cluster = nebius.sync_call(
+            service.create(nebius.compute().CreateGpuClusterRequest(
+                metadata=nebius.nebius_common().ResourceMetadata(
+                    parent_id=project_id,
+                    name=name,
+                ),
+                spec=nebius.compute().GpuClusterSpec(
+                    infiniband_fabric=fabric))))
         cluster_id = cluster.resource_id
     return cluster_id
 
@@ -78,14 +82,16 @@ def delete_cluster(name: str, region: str) -> None:
     project_id = get_project_by_region(region)
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
-            parent_id=project_id,
-            name=name,
-        )).wait()
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=name,
+            )))
         cluster_id = cluster.metadata.id
         logger.debug(f'Found GPU Cluster : {cluster_id}.')
-        service.delete(
-            nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
+        nebius.sync_call(
+            service.delete(
+                nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
         logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
     except nebius.request_error():
         logger.debug('GPU Cluster does not exist.')
@@ -94,8 +100,10 @@ def delete_cluster(name: str, region: str) -> None:
 def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
     """Lists instances associated with API key."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result = service.list(
-        nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()
+    result = nebius.sync_call(
+        service.list(
+            nebius.compute().ListInstancesRequest(parent_id=project_id),
+            timeout=nebius.READ_TIMEOUT))
 
     instances = result
 
@@ -116,12 +124,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
 
 def stop(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
+    nebius.sync_call(
+        service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance = service.get(nebius.compute().GetInstanceRequest(
-            id=instance_id,)).wait()
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'STOPPED':
             break
         time.sleep(POLL_INTERVAL)
@@ -138,12 +147,13 @@ def stop(instance_id: str) -> None:
 
 def start(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
+    nebius.sync_call(
+        service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance = service.get(nebius.compute().GetInstanceRequest(
-            id=instance_id,)).wait()
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'RUNNING':
             break
         time.sleep(POLL_INTERVAL)
@@ -212,24 +222,26 @@ def launch(cluster_name_on_cloud: str,
                                                project_id, fabric)
 
     service = nebius.compute().DiskServiceClient(nebius.sdk())
-    disk = service.create(nebius.compute().CreateDiskRequest(
-        metadata=nebius.nebius_common().ResourceMetadata(
-            parent_id=project_id,
-            name=disk_name,
-        ),
-        spec=nebius.compute().DiskSpec(
-            source_image_family=nebius.compute().SourceImageFamily(
-                image_family=image_family),
-            size_gibibytes=disk_size,
-            type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
-        ))).wait()
+    disk = nebius.sync_call(
+        service.create(nebius.compute().CreateDiskRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=disk_name,
+            ),
+            spec=nebius.compute().DiskSpec(
+                source_image_family=nebius.compute().SourceImageFamily(
+                    image_family=image_family),
+                size_gibibytes=disk_size,
+                type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+            ))))
     disk_id = disk.resource_id
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
-        disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
-            parent_id=project_id,
-            name=disk_name,
-        )).wait()
+        disk = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=disk_name,
+            )))
         if disk.status.state.name == 'READY':
             break
         logger.debug(f'Waiting for disk {disk_name} to be ready.')
@@ -254,50 +266,53 @@ def launch(cluster_name_on_cloud: str,
                     id=fs['filesystem_id'])))
 
     service = nebius.vpc().SubnetServiceClient(nebius.sdk())
-    sub_net = service.list(nebius.vpc().ListSubnetsRequest(
-        parent_id=project_id,)).wait()
+    sub_net = nebius.sync_call(
+        service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))
 
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    service.create(nebius.compute().CreateInstanceRequest(
-        metadata=nebius.nebius_common().ResourceMetadata(
-            parent_id=project_id,
-            name=instance_name,
-        ),
-        spec=nebius.compute().InstanceSpec(
-            gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
-            if cluster_id is not None else None,
-            boot_disk=nebius.compute().AttachedDiskSpec(
-                attach_mode=nebius.compute(
-                ).AttachedDiskSpec.AttachMode.READ_WRITE,
-                existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
-            cloud_init_user_data=user_data,
-            resources=nebius.compute().ResourcesSpec(platform=platform,
-                                                     preset=preset),
-            filesystems=filesystems_spec if filesystems_spec else None,
-            network_interfaces=[
-                nebius.compute().NetworkInterfaceSpec(
-                    subnet_id=sub_net.items[0].metadata.id,
-                    ip_address=nebius.compute().IPAddress(),
-                    name='network-interface-0',
-                    public_ip_address=nebius.compute().PublicIPAddress()
-                    if associate_public_ip_address else None,
-                )
-            ],
-            recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
-            if use_spot else None,
-            preemptible=nebius.compute().PreemptibleSpec(
-                priority=1,
-                on_preemption=nebius.compute(
-                ).PreemptibleSpec.PreemptionPolicy.STOP) if use_spot else None,
-        ))).wait()
+    logger.debug(f'Creating instance {instance_name} in project {project_id}.')
+    nebius.sync_call(
+        service.create(nebius.compute().CreateInstanceRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=instance_name,
+            ),
+            spec=nebius.compute().InstanceSpec(
+                gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
+                    id=cluster_id,) if cluster_id is not None else None,
+                boot_disk=nebius.compute().AttachedDiskSpec(
+                    attach_mode=nebius.compute(
+                    ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                    existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+                cloud_init_user_data=user_data,
+                resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                         preset=preset),
+                filesystems=filesystems_spec if filesystems_spec else None,
+                network_interfaces=[
+                    nebius.compute().NetworkInterfaceSpec(
+                        subnet_id=sub_net.items[0].metadata.id,
+                        ip_address=nebius.compute().IPAddress(),
+                        name='network-interface-0',
+                        public_ip_address=nebius.compute().PublicIPAddress()
+                        if associate_public_ip_address else None,
+                    )
+                ],
+                recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
+                if use_spot else None,
+                preemptible=nebius.compute().PreemptibleSpec(
+                    priority=1,
+                    on_preemption=nebius.compute().PreemptibleSpec.
+                    PreemptionPolicy.STOP) if use_spot else None,
+            ))))
     instance_id = ''
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
-            parent_id=project_id,
-            name=instance_name,
-        )).wait()
+        instance = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=instance_name,
+            )))
         if instance.status.state.name == 'STARTING':
             instance_id = instance.metadata.id
             break
@@ -317,19 +332,19 @@ def launch(cluster_name_on_cloud: str,
 def remove(instance_id: str) -> None:
     """Terminates the given instance."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result = service.get(
-        nebius.compute().GetInstanceRequest(id=instance_id)).wait()
+    result = nebius.sync_call(
+        service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
     disk_id = result.spec.boot_disk.existing_disk.id
-    service.delete(
-        nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
+    nebius.sync_call(
+        service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
     retry_count = 0
     # The instance begins deleting and attempts to delete the disk.
     # Must wait until the disk is unlocked and becomes deletable.
     while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
         try:
             service = nebius.compute().DiskServiceClient(nebius.sdk())
-            service.delete(
-                nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
+            nebius.sync_call(
+                service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
            break
         except nebius.request_error():
            logger.debug('Waiting for disk deletion.')
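
Note on the hunks above: every direct .wait() on a Nebius SDK operation is now routed through nebius.sync_call(...), and list calls additionally pass timeout=nebius.READ_TIMEOUT. The wrapper itself lives in sky/adaptors/nebius.py (changed in this release but not shown in this excerpt), so the following is only a minimal sketch of what such a helper could look like; the body is an assumption, not the actual implementation.

    # Hypothetical sketch of the sync_call wrapper; the real helper in
    # sky/adaptors/nebius.py may differ (for example, it could drive the SDK
    # event loop directly or add centralized retry and error handling).
    def sync_call(operation):
        """Resolve a Nebius SDK operation synchronously.

        Assumes `operation` is the object returned by the SDK service clients,
        i.e. the same object the old code resolved inline with `.wait()`.
        """
        return operation.wait()
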
@@ -32,6 +32,7 @@ logger = sky_logging.init_logger(__name__)
 @query_utils.debug_enabled(logger)
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
@@ -43,6 +44,7 @@ def query_instances(
     A None status means the instance is marked as "terminated"
     or "terminating".
     """
+    del cluster_name  # unused
     assert provider_config is not None, cluster_name_on_cloud
     region = provider_config['region']
 
@@ -277,12 +277,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del non_terminated_only
+    del cluster_name, non_terminated_only  #unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
 
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
-import sky.provision.paperspace.constants as constants
+from sky.provision.paperspace import constants
 from sky.utils import common_utils
 
 if typing.TYPE_CHECKING:
@@ -76,7 +76,8 @@ def _bulk_provision(
     logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
     rich_utils.force_update_status(
         ux_utils.spinner_message('Launching - Checking instance status',
-                                 str(provision_logging.config.log_path)))
+                                 str(provision_logging.config.log_path),
+                                 cluster_name=str(cluster_name)))
     # AWS would take a very short time (<<1s) updating the state of the
     # instance.
     time.sleep(1)
@@ -462,9 +463,9 @@ def _post_provision_setup(
     docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-            ux_utils.spinner_message(
-                'Launching - Waiting for SSH access',
-                provision_logging.config.log_path)) as status:
+            ux_utils.spinner_message('Launching - Waiting for SSH access',
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name))) as status:
         # If on Kubernetes, skip SSH check since the pods are guaranteed to be
         # ready by the provisioner, and we use kubectl instead of SSH to run the
         # commands and rsync on the pods. SSH will still be ready after a while
@@ -493,7 +494,8 @@ def _post_provision_setup(
         status.update(
             ux_utils.spinner_message(
                 'Launching - Initializing docker container',
-                provision_logging.config.log_path))
+                provision_logging.config.log_path,
+                cluster_name=str(cluster_name)))
         docker_user = instance_setup.initialize_docker(
             cluster_name.name_on_cloud,
             docker_config=docker_config,
@@ -541,7 +543,8 @@ def _post_provision_setup(
 
     runtime_preparation_str = (ux_utils.spinner_message(
         'Preparing SkyPilot runtime ({step}/3 - {step_name})',
-        provision_logging.config.log_path))
+        provision_logging.config.log_path,
+        cluster_name=str(cluster_name)))
     status.update(
         runtime_preparation_str.format(step=1, step_name='initializing'))
     instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -679,7 +682,8 @@ def _post_provision_setup(
     if logging_agent:
         status.update(
             ux_utils.spinner_message('Setting up logging agent',
-                                     provision_logging.config.log_path))
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name)))
         instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
                                                 cluster_info,
                                                 ssh_credentials)
@@ -689,7 +693,8 @@ def _post_provision_setup(
 
     logger.info(
         ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
-                                   provision_logging.config.log_path))
+                                   provision_logging.config.log_path,
+                                   cluster_name=str(cluster_name)))
     return cluster_info
 
 
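
Note on the provisioner hunks above: each progress message now also passes cluster_name=str(cluster_name) to ux_utils.spinner_message (and ux_utils.finishing_message). The corresponding change to sky/utils/ux_utils.py is not shown in this excerpt, so the signature below is only an assumption inferred from the call sites, not the real helper.

    from typing import Optional

    # Assumed shape of the extended helper; the actual implementation in
    # sky/utils/ux_utils.py may render the cluster name and log-path hint
    # differently.
    def spinner_message(message: str,
                        log_path: Optional[str] = None,
                        cluster_name: Optional[str] = None) -> str:
        suffix = f' ({cluster_name})' if cluster_name else ''
        hint = f' (logs: {log_path})' if log_path else ''
        return f'{message}{suffix}{hint}'
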
@@ -201,11 +201,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
 from sky import sky_logging
 from sky.adaptors import runpod
 from sky.provision import docker_utils
-import sky.provision.runpod.api.commands as runpod_commands
+from sky.provision.runpod.api import commands as runpod_commands
 from sky.skylet import constants
 from sky.utils import common_utils
 
@@ -427,10 +427,12 @@ def terminate_instances(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -216,11 +216,13 @@ def open_ports(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     # "running", "frozen", "stopped", "unknown", "loading"
@@ -393,11 +393,13 @@ def _get_cluster_name_filter(cluster_name_on_cloud):
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     logger.info('New provision of Vsphere: query_instances().')
     assert provider_config is not None, cluster_name_on_cloud
     region = provider_config['region']
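
The provider hunks above all make the same change: query_instances now receives the user-facing cluster name in addition to cluster_name_on_cloud, and providers that do not use it delete it immediately. The template below restates that shared signature for reference; the body is a placeholder, and the return annotation is simplified relative to the real Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]].

    from typing import Any, Dict, Optional, Tuple

    def query_instances(
        cluster_name: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None,
        non_terminated_only: bool = True,
    ) -> Dict[str, Tuple[Optional[str], Optional[str]]]:
        """See sky/provision/__init__.py"""
        # Providers that do not need the display name drop it right away,
        # exactly as in the hunks above.
        del cluster_name  # unused
        assert provider_config is not None, cluster_name_on_cloud
        return {}
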
sky/resources.py CHANGED
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
 
 import colorama
 
-import sky
 from sky import catalog
 from sky import check as sky_check
 from sky import clouds
@@ -38,7 +37,7 @@ if typing.TYPE_CHECKING:
 
 logger = sky_logging.init_logger(__name__)
 
-_DEFAULT_DISK_SIZE_GB = 256
+DEFAULT_DISK_SIZE_GB = 256
 
 RESOURCE_CONFIG_ALIASES = {
     'gpus': 'accelerators',
@@ -288,7 +287,7 @@ class Resources:
         if infra is not None:
             infra_info = infra_utils.InfraInfo.from_str(infra)
             # Infra takes precedence over individually specified parameters
-            cloud = sky.CLOUD_REGISTRY.from_str(infra_info.cloud)
+            cloud = registry.CLOUD_REGISTRY.from_str(infra_info.cloud)
             region = infra_info.region
             zone = infra_info.zone
 
@@ -320,7 +319,7 @@ class Resources:
             self._disk_size = int(
                 resources_utils.parse_memory_resource(disk_size, 'disk_size'))
         else:
-            self._disk_size = _DEFAULT_DISK_SIZE_GB
+            self._disk_size = DEFAULT_DISK_SIZE_GB
 
         self._image_id: Optional[Dict[Optional[str], str]] = None
         if isinstance(image_id, str):
@@ -483,7 +482,7 @@ class Resources:
             network_tier = f', network_tier={self.network_tier.value}'
 
         disk_size = ''
-        if self.disk_size != _DEFAULT_DISK_SIZE_GB:
+        if self.disk_size != DEFAULT_DISK_SIZE_GB:
             disk_size = f', disk_size={self.disk_size}'
 
         ports = ''
@@ -1767,7 +1766,7 @@ class Resources:
             self._accelerators is None,
             self._accelerator_args is None,
             not self._use_spot_specified,
-            self._disk_size == _DEFAULT_DISK_SIZE_GB,
+            self._disk_size == DEFAULT_DISK_SIZE_GB,
             self._disk_tier is None,
             self._network_tier is None,
             self._image_id is None,
@@ -2256,7 +2255,7 @@ class Resources:
         accelerator_args = state.pop('accelerator_args', None)
         state['_accelerator_args'] = accelerator_args
 
-        disk_size = state.pop('disk_size', _DEFAULT_DISK_SIZE_GB)
+        disk_size = state.pop('disk_size', DEFAULT_DISK_SIZE_GB)
         state['_disk_size'] = disk_size
 
         if version < 2:
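
With the rename above, the default disk size becomes a public constant, DEFAULT_DISK_SIZE_GB, instead of the private _DEFAULT_DISK_SIZE_GB, presumably so other modules can reference it rather than hard-coding 256. A minimal usage sketch:

    from sky import resources

    # 256 GiB unless the user sets disk_size explicitly (value from the hunk above).
    print(resources.DEFAULT_DISK_SIZE_GB)
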
File without changes
File without changes
@@ -0,0 +1,70 @@
+"""Responses for the API server."""
+
+from typing import Optional
+
+import pydantic
+
+from sky import models
+from sky.server import common
+
+
+class ResponseBaseModel(pydantic.BaseModel):
+    """A pydantic model that acts like a dict.
+
+    Supports the following syntax:
+    class SampleResponse(DictLikePayload):
+        field: str
+
+    response = SampleResponse(field='value')
+    print(response['field']) # prints 'value'
+    response['field'] = 'value2'
+    print(response['field']) # prints 'value2'
+    print('field' in response) # prints True
+
+    This model exists for backwards compatibility with the
+    old SDK that used to return a dict.
+
+    The backward compatibility may be removed
+    in the future.
+    """
+    # Ignore extra fields in the request body, which is useful for backward
+    # compatibility. The difference with `allow` is that `ignore` will not
+    # include the unknown fields when dump the model, i.e., we can add new
+    # fields to the request body without breaking the existing old API server
+    # where the handler function does not accept the new field in function
+    # signature.
+    model_config = pydantic.ConfigDict(extra='ignore')
+
+    # backward compatibility with dict
+    # TODO(syang): remove this in v0.13.0
+    def __getitem__(self, key):
+        try:
+            return getattr(self, key)
+        except AttributeError as e:
+            raise KeyError(key) from e
+
+    def __setitem__(self, key, value):
+        setattr(self, key, value)
+
+    def __contains__(self, key):
+        return hasattr(self, key)
+
+    def keys(self):
+        return self.model_dump().keys()
+
+    def values(self):
+        return self.model_dump().values()
+
+    def items(self):
+        return self.model_dump().items()
+
+
+class APIHealthResponse(ResponseBaseModel):
+    """Response for the API health endpoint."""
+    status: common.ApiServerStatus
+    api_version: str = ''
+    version: str = ''
+    version_on_disk: str = ''
+    commit: str = ''
+    basic_auth_enabled: bool = False
+    user: Optional[models.User] = None
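
The ResponseBaseModel class above supports both attribute and dict-style access, so subclasses keep working with callers written against the old dict-returning SDK. A small usage sketch (SampleResponse is a hypothetical subclass, mirroring the one in the docstring, not a class shipped in the package):

    from sky.schemas.api.responses import ResponseBaseModel

    class SampleResponse(ResponseBaseModel):
        field: str

    response = SampleResponse(field='value')
    assert response.field == 'value'       # attribute access (pydantic)
    assert response['field'] == 'value'    # dict-style access (back-compat)
    response['field'] = 'value2'
    assert 'field' in response
    assert list(response.keys()) == ['field']
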
@@ -0,0 +1,41 @@
+"""Add provision_log_path to clusters and cluster_history.
+
+Revision ID: 006
+Revises: 005
+Create Date: 2025-08-12
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '006'
+down_revision: Union[str, Sequence[str], None] = '005'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add provision_log_path columns."""
+    with op.get_context().autocommit_block():
+        # clusters.provision_log_path
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'provision_log_path',
+                                             sa.Text(),
+                                             server_default=None)
+
+        # cluster_history.provision_log_path
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'provision_log_path',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
File without changes