skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/provision/nebius/utils.py
CHANGED
@@ -36,8 +36,10 @@ def retry(func):
 
 def get_project_by_region(region: str) -> str:
     service = nebius.iam().ProjectServiceClient(nebius.sdk())
-    projects =
-
+    projects = nebius.sync_call(
+        service.list(
+            nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
+            timeout=nebius.READ_TIMEOUT))
 
     # Check is there project if in config
     project_id = skypilot_config.get_effective_region_config(
@@ -56,19 +58,21 @@ def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
     """
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster =
-
-            name=name,
-        )).wait()
-        cluster_id = cluster.metadata.id
-    except nebius.request_error():
-        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
-            metadata=nebius.nebius_common().ResourceMetadata(
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
                 parent_id=project_id,
                 name=name,
-            )
-
-
+            )))
+        cluster_id = cluster.metadata.id
+    except nebius.request_error():
+        cluster = nebius.sync_call(
+            service.create(nebius.compute().CreateGpuClusterRequest(
+                metadata=nebius.nebius_common().ResourceMetadata(
+                    parent_id=project_id,
+                    name=name,
+                ),
+                spec=nebius.compute().GpuClusterSpec(
+                    infiniband_fabric=fabric))))
         cluster_id = cluster.resource_id
     return cluster_id
 
@@ -78,14 +82,16 @@ def delete_cluster(name: str, region: str) -> None:
     project_id = get_project_by_region(region)
     service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
     try:
-        cluster =
-
-
-
+        cluster = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=name,
+            )))
        cluster_id = cluster.metadata.id
        logger.debug(f'Found GPU Cluster : {cluster_id}.')
-
-
+        nebius.sync_call(
+            service.delete(
+                nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
        logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
    except nebius.request_error():
        logger.debug('GPU Cluster does not exist.')
@@ -94,8 +100,10 @@ def delete_cluster(name: str, region: str) -> None:
 def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
     """Lists instances associated with API key."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result =
-
+    result = nebius.sync_call(
+        service.list(
+            nebius.compute().ListInstancesRequest(parent_id=project_id),
+            timeout=nebius.READ_TIMEOUT))
 
     instances = result
 
@@ -116,12 +124,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
 
 def stop(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
+    nebius.sync_call(
+        service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-            id=instance_id,))
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'STOPPED':
             break
         time.sleep(POLL_INTERVAL)
@@ -138,12 +147,13 @@ def stop(instance_id: str) -> None:
 
 def start(instance_id: str) -> None:
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
+    nebius.sync_call(
+        service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-            id=instance_id,))
+        instance = nebius.sync_call(
+            service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
         if instance.status.state.name == 'RUNNING':
             break
         time.sleep(POLL_INTERVAL)
@@ -212,24 +222,26 @@ def launch(cluster_name_on_cloud: str,
                                              project_id, fabric)
 
     service = nebius.compute().DiskServiceClient(nebius.sdk())
-    disk =
-
-
-
-
-
-
-
-
-
+    disk = nebius.sync_call(
+        service.create(nebius.compute().CreateDiskRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=disk_name,
+            ),
+            spec=nebius.compute().DiskSpec(
+                source_image_family=nebius.compute().SourceImageFamily(
+                    image_family=image_family),
+                size_gibibytes=disk_size,
+                type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+            ))))
     disk_id = disk.resource_id
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
-        disk =
-
-
-
+        disk = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=disk_name,
+            )))
         if disk.status.state.name == 'READY':
             break
         logger.debug(f'Waiting for disk {disk_name} to be ready.')
@@ -254,50 +266,53 @@ def launch(cluster_name_on_cloud: str,
                     id=fs['filesystem_id'])))
 
     service = nebius.vpc().SubnetServiceClient(nebius.sdk())
-    sub_net =
-        parent_id=project_id,))
+    sub_net = nebius.sync_call(
+        service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))
 
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-
-
-
-
-
-
-
-
-
-
-        ).AttachedDiskSpec
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    logger.debug(f'Creating instance {instance_name} in project {project_id}.')
+    nebius.sync_call(
+        service.create(nebius.compute().CreateInstanceRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=instance_name,
+            ),
+            spec=nebius.compute().InstanceSpec(
+                gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
+                    id=cluster_id,) if cluster_id is not None else None,
+                boot_disk=nebius.compute().AttachedDiskSpec(
+                    attach_mode=nebius.compute(
+                    ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                    existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+                cloud_init_user_data=user_data,
+                resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                         preset=preset),
+                filesystems=filesystems_spec if filesystems_spec else None,
+                network_interfaces=[
+                    nebius.compute().NetworkInterfaceSpec(
+                        subnet_id=sub_net.items[0].metadata.id,
+                        ip_address=nebius.compute().IPAddress(),
+                        name='network-interface-0',
+                        public_ip_address=nebius.compute().PublicIPAddress()
+                        if associate_public_ip_address else None,
+                    )
+                ],
+                recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
+                if use_spot else None,
+                preemptible=nebius.compute().PreemptibleSpec(
+                    priority=1,
+                    on_preemption=nebius.compute().PreemptibleSpec.
+                    PreemptionPolicy.STOP) if use_spot else None,
+            ))))
     instance_id = ''
     retry_count = 0
     while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
         service = nebius.compute().InstanceServiceClient(nebius.sdk())
-        instance =
-
-
-
+        instance = nebius.sync_call(
+            service.get_by_name(nebius.nebius_common().GetByNameRequest(
+                parent_id=project_id,
+                name=instance_name,
+            )))
         if instance.status.state.name == 'STARTING':
             instance_id = instance.metadata.id
             break
@@ -317,19 +332,19 @@ def launch(cluster_name_on_cloud: str,
 def remove(instance_id: str) -> None:
     """Terminates the given instance."""
     service = nebius.compute().InstanceServiceClient(nebius.sdk())
-    result =
-        nebius.compute().GetInstanceRequest(id=instance_id))
+    result = nebius.sync_call(
+        service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
     disk_id = result.spec.boot_disk.existing_disk.id
-
-        nebius.compute().DeleteInstanceRequest(id=instance_id))
+    nebius.sync_call(
+        service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
     retry_count = 0
     # The instance begins deleting and attempts to delete the disk.
     # Must wait until the disk is unlocked and becomes deletable.
     while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
         try:
             service = nebius.compute().DiskServiceClient(nebius.sdk())
-
-                nebius.compute().DeleteDiskRequest(id=disk_id))
+            nebius.sync_call(
+                service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
             break
         except nebius.request_error():
             logger.debug('Waiting for disk deletion.')
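Every Nebius SDK request in this file is now routed through nebius.sync_call(...), so the provisioning code keeps its synchronous shape while the SDK call (service.list, service.create, service.get_by_name, and so on) is executed and waited on in one place. The real helper lives in sky/adaptors/nebius.py (also changed in this release); the sketch below is only an illustration of the idea and assumes a plain asyncio.run-based loop, not whatever loop management the adaptor actually uses.

# Minimal sketch of a sync_call-style wrapper; an assumption for illustration,
# not SkyPilot's actual implementation in sky/adaptors/nebius.py.
import asyncio
from typing import Awaitable, TypeVar

T = TypeVar('T')


def sync_call(awaitable: Awaitable[T]) -> T:
    """Blocks the calling thread until the given SDK awaitable completes."""

    async def _run() -> T:
        return await awaitable

    return asyncio.run(_run())

With a helper shaped like this, a request such as service.list(ListInstancesRequest(parent_id=project_id), timeout=READ_TIMEOUT) stays exactly where it was and is simply wrapped, which is the pattern every hunk above follows.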
sky/provision/oci/instance.py
CHANGED
@@ -32,6 +32,7 @@ logger = sky_logging.init_logger(__name__)
 @query_utils.debug_enabled(logger)
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
@@ -43,6 +44,7 @@ def query_instances(
     A None status means the instance is marked as "terminated"
     or "terminating".
     """
+    del cluster_name  # unused
     assert provider_config is not None, cluster_name_on_cloud
     region = provider_config['region']
 

sky/provision/paperspace/instance.py
CHANGED

@@ -277,12 +277,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del non_terminated_only
+    del cluster_name, non_terminated_only  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 

sky/provision/paperspace/utils.py
CHANGED

@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
 
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
-
+from sky.provision.paperspace import constants
 from sky.utils import common_utils
 
 if typing.TYPE_CHECKING:
sky/provision/provisioner.py
CHANGED
@@ -76,7 +76,8 @@ def _bulk_provision(
     logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
     rich_utils.force_update_status(
         ux_utils.spinner_message('Launching - Checking instance status',
-                                 str(provision_logging.config.log_path)
+                                 str(provision_logging.config.log_path),
+                                 cluster_name=str(cluster_name)))
     # AWS would take a very short time (<<1s) updating the state of the
     # instance.
     time.sleep(1)
@@ -462,9 +463,9 @@ def _post_provision_setup(
     docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-            ux_utils.spinner_message(
-
-
+            ux_utils.spinner_message('Launching - Waiting for SSH access',
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name))) as status:
         # If on Kubernetes, skip SSH check since the pods are guaranteed to be
         # ready by the provisioner, and we use kubectl instead of SSH to run the
         # commands and rsync on the pods. SSH will still be ready after a while
@@ -493,7 +494,8 @@ def _post_provision_setup(
         status.update(
             ux_utils.spinner_message(
                 'Launching - Initializing docker container',
-                provision_logging.config.log_path
+                provision_logging.config.log_path,
+                cluster_name=str(cluster_name)))
         docker_user = instance_setup.initialize_docker(
             cluster_name.name_on_cloud,
             docker_config=docker_config,
@@ -541,7 +543,8 @@ def _post_provision_setup(
 
     runtime_preparation_str = (ux_utils.spinner_message(
         'Preparing SkyPilot runtime ({step}/3 - {step_name})',
-        provision_logging.config.log_path
+        provision_logging.config.log_path,
+        cluster_name=str(cluster_name)))
     status.update(
         runtime_preparation_str.format(step=1, step_name='initializing'))
     instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -679,7 +682,8 @@ def _post_provision_setup(
     if logging_agent:
         status.update(
             ux_utils.spinner_message('Setting up logging agent',
-                                     provision_logging.config.log_path
+                                     provision_logging.config.log_path,
+                                     cluster_name=str(cluster_name)))
         instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
                                                 cluster_info,
                                                 ssh_credentials)
@@ -689,7 +693,8 @@ def _post_provision_setup(
 
     logger.info(
         ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
-                                   provision_logging.config.log_path
+                                   provision_logging.config.log_path,
+                                   cluster_name=str(cluster_name)))
     return cluster_info
 
 
sky/provision/runpod/instance.py
CHANGED
@@ -201,11 +201,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/runpod/utils.py
CHANGED
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
 from sky import sky_logging
 from sky.adaptors import runpod
 from sky.provision import docker_utils
-
+from sky.provision.runpod.api import commands as runpod_commands
 from sky.skylet import constants
 from sky.utils import common_utils
 
sky/provision/scp/instance.py
CHANGED
@@ -427,10 +427,12 @@ def terminate_instances(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
sky/provision/vast/instance.py
CHANGED
@@ -216,11 +216,13 @@ def open_ports(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     # "running", "frozen", "stopped", "unknown", "loading"

sky/provision/vsphere/instance.py
CHANGED

@@ -393,11 +393,13 @@ def _get_cluster_name_filter(cluster_name_on_cloud):
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     logger.info('New provision of Vsphere: query_instances().')
     assert provider_config is not None, cluster_name_on_cloud
     region = provider_config['region']
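Across the provider backends in this release (the OCI, Paperspace, RunPod, SCP, Vast, and vSphere hunks above), query_instances gains a leading cluster_name parameter; providers that only need the on-cloud name simply del it. A provider following that shape would look roughly like the sketch below; the body is schematic only (the listing helper and the status_lib return type are the per-provider details visible in each hunk).

# Schematic of the updated provider-facing signature; illustrative only,
# with a simplified return type instead of status_lib.ClusterStatus.
from typing import Any, Dict, Optional, Tuple


def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Tuple[Optional[str], Optional[str]]]:
    """See sky/provision/__init__.py (simplified in this sketch)."""
    del cluster_name  # unused by providers that key on the on-cloud name
    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
    # Provider-specific instance listing and status mapping would go here.
    return {}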
sky/resources.py
CHANGED
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
 
 import colorama
 
-import sky
 from sky import catalog
 from sky import check as sky_check
 from sky import clouds
@@ -38,7 +37,7 @@ if typing.TYPE_CHECKING:
 
 logger = sky_logging.init_logger(__name__)
 
-
+DEFAULT_DISK_SIZE_GB = 256
 
 RESOURCE_CONFIG_ALIASES = {
     'gpus': 'accelerators',
@@ -288,7 +287,7 @@ class Resources:
         if infra is not None:
             infra_info = infra_utils.InfraInfo.from_str(infra)
             # Infra takes precedence over individually specified parameters
-            cloud =
+            cloud = registry.CLOUD_REGISTRY.from_str(infra_info.cloud)
             region = infra_info.region
             zone = infra_info.zone
 
@@ -320,7 +319,7 @@ class Resources:
             self._disk_size = int(
                 resources_utils.parse_memory_resource(disk_size, 'disk_size'))
         else:
-            self._disk_size =
+            self._disk_size = DEFAULT_DISK_SIZE_GB
 
         self._image_id: Optional[Dict[Optional[str], str]] = None
         if isinstance(image_id, str):
@@ -483,7 +482,7 @@ class Resources:
             network_tier = f', network_tier={self.network_tier.value}'
 
         disk_size = ''
-        if self.disk_size !=
+        if self.disk_size != DEFAULT_DISK_SIZE_GB:
             disk_size = f', disk_size={self.disk_size}'
 
         ports = ''
@@ -1767,7 +1766,7 @@ class Resources:
             self._accelerators is None,
             self._accelerator_args is None,
             not self._use_spot_specified,
-            self._disk_size ==
+            self._disk_size == DEFAULT_DISK_SIZE_GB,
             self._disk_tier is None,
             self._network_tier is None,
             self._image_id is None,
@@ -2256,7 +2255,7 @@ class Resources:
         accelerator_args = state.pop('accelerator_args', None)
         state['_accelerator_args'] = accelerator_args
 
-        disk_size = state.pop('disk_size',
+        disk_size = state.pop('disk_size', DEFAULT_DISK_SIZE_GB)
         state['_disk_size'] = disk_size
 
         if version < 2:
sky/schemas/__init__.py
ADDED
File without changes

sky/schemas/api/__init__.py
ADDED
File without changes

sky/schemas/api/responses.py
ADDED

@@ -0,0 +1,70 @@
+"""Responses for the API server."""
+
+from typing import Optional
+
+import pydantic
+
+from sky import models
+from sky.server import common
+
+
+class ResponseBaseModel(pydantic.BaseModel):
+    """A pydantic model that acts like a dict.
+
+    Supports the following syntax:
+    class SampleResponse(DictLikePayload):
+        field: str
+
+    response = SampleResponse(field='value')
+    print(response['field']) # prints 'value'
+    response['field'] = 'value2'
+    print(response['field']) # prints 'value2'
+    print('field' in response) # prints True
+
+    This model exists for backwards compatibility with the
+    old SDK that used to return a dict.
+
+    The backward compatibility may be removed
+    in the future.
+    """
+    # Ignore extra fields in the request body, which is useful for backward
+    # compatibility. The difference with `allow` is that `ignore` will not
+    # include the unknown fields when dump the model, i.e., we can add new
+    # fields to the request body without breaking the existing old API server
+    # where the handler function does not accept the new field in function
+    # signature.
+    model_config = pydantic.ConfigDict(extra='ignore')
+
+    # backward compatibility with dict
+    # TODO(syang): remove this in v0.13.0
+    def __getitem__(self, key):
+        try:
+            return getattr(self, key)
+        except AttributeError as e:
+            raise KeyError(key) from e
+
+    def __setitem__(self, key, value):
+        setattr(self, key, value)
+
+    def __contains__(self, key):
+        return hasattr(self, key)
+
+    def keys(self):
+        return self.model_dump().keys()
+
+    def values(self):
+        return self.model_dump().values()
+
+    def items(self):
+        return self.model_dump().items()
+
+
+class APIHealthResponse(ResponseBaseModel):
+    """Response for the API health endpoint."""
+    status: common.ApiServerStatus
+    api_version: str = ''
+    version: str = ''
+    version_on_disk: str = ''
+    commit: str = ''
+    basic_auth_enabled: bool = False
+    user: Optional[models.User] = None

sky/schemas/db/global_user_state/006_provision_log.py
ADDED

@@ -0,0 +1,41 @@
+"""Add provision_log_path to clusters and cluster_history.
+
+Revision ID: 006
+Revises: 005
+Create Date: 2025-08-12
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '006'
+down_revision: Union[str, Sequence[str], None] = '005'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add provision_log_path columns."""
+    with op.get_context().autocommit_block():
+        # clusters.provision_log_path
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'provision_log_path',
+                                             sa.Text(),
+                                             server_default=None)
+
+        # cluster_history.provision_log_path
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'provision_log_path',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass

sky/schemas/generated/__init__.py
ADDED
File without changes
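The dict-style accessors on ResponseBaseModel in sky/schemas/api/responses.py above exist so that callers written against the old dict-returning SDK keep working. A rough usage sketch follows; the enum member name HEALTHY is an assumption about sky.server.common.ApiServerStatus, not something shown in this diff.

# Illustrative only; assumes ApiServerStatus exposes a HEALTHY member.
from sky.schemas.api import responses
from sky.server import common

health = responses.APIHealthResponse(status=common.ApiServerStatus.HEALTHY,
                                     api_version='11',
                                     version='1.0.0.dev20250815')
print(health['status'])        # dict-style read, same as health.status
health['commit'] = 'abcdef0'   # dict-style write via __setitem__
print('commit' in health)      # True, via __contains__
print(dict(health.items()))    # plain dict through model_dump()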