skypilot-nightly 1.0.0.dev20251012__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/shadeform.py +89 -0
- sky/authentication.py +52 -2
- sky/backends/backend_utils.py +35 -25
- sky/backends/cloud_vm_ray_backend.py +5 -5
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +19 -25
- sky/catalog/shadeform_catalog.py +165 -0
- sky/client/cli/command.py +53 -19
- sky/client/sdk.py +13 -1
- sky/clouds/__init__.py +2 -0
- sky/clouds/shadeform.py +393 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +122 -145
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +5 -5
- sky/jobs/state.py +65 -21
- sky/jobs/utils.py +58 -22
- sky/metrics/utils.py +27 -6
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/utils.py +44 -39
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/server/common.py +4 -2
- sky/server/requests/executor.py +25 -3
- sky/server/server.py +9 -3
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +23 -6
- sky/skylet/log_lib.py +0 -1
- sky/skylet/log_lib.pyi +1 -1
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/utils/common.py +2 -0
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +15 -11
- sky/utils/controller_utils.py +35 -8
- sky/utils/locks.py +20 -5
- sky/utils/subprocess_utils.py +4 -3
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +39 -38
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +63 -54
- /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
sky/provision/shadeform/instance.py
ADDED

@@ -0,0 +1,351 @@
+"""Shadeform instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
+
+from sky import sky_logging
+from sky.provision import common
+from sky.provision.shadeform import shadeform_utils
+from sky.utils import status_lib
+
+POLL_INTERVAL = 10
+INSTANCE_READY_TIMEOUT = 3600
+
+logger = sky_logging.init_logger(__name__)
+
+# Status mapping from Shadeform to SkyPilot
+SHADEFORM_STATUS_MAP = {
+    'creating': status_lib.ClusterStatus.INIT,
+    'pending_provider': status_lib.ClusterStatus.INIT,
+    'pending': status_lib.ClusterStatus.INIT,
+    'active': status_lib.ClusterStatus.UP,
+    'deleted': status_lib.ClusterStatus.STOPPED,
+}
+
+
+def _get_cluster_instances(cluster_name_on_cloud: str) -> Dict[str, Any]:
+    """Get all instances belonging to a cluster."""
+    try:
+        response = shadeform_utils.get_instances()
+        instances = response.get('instances', [])
+
+        cluster_instances = {}
+        possible_names = [
+            f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
+        ]
+
+        for instance in instances:
+            if instance.get('name') in possible_names:
+                cluster_instances[instance['id']] = instance
+
+        return cluster_instances
+    except (ValueError, KeyError, requests.exceptions.RequestException) as e:
+        logger.warning(f'Failed to get instances: {e}')
+        return {}
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    """Get the head instance ID from a list of instances."""
+    for instance_id, instance in instances.items():
+        if instance.get('name', '').endswith('-head'):
+            return instance_id
+    return None
+
+
+def _wait_for_instances_ready(cluster_name_on_cloud: str,
+                              expected_count: int,
+                              timeout: int = INSTANCE_READY_TIMEOUT) -> bool:
+    """Wait for instances to be ready (active state with SSH access)."""
+    start_time = time.time()
+
+    while time.time() - start_time < timeout:
+        instances = _get_cluster_instances(cluster_name_on_cloud)
+        ready_count = 0
+
+        for instance in instances.values():
+            if (instance.get('status') == 'active' and
+                    instance.get('ip') is not None and
+                    instance.get('ssh_port') is not None):
+                ready_count += 1
+
+        logger.info(f'Waiting for instances to be ready: '
+                    f'({ready_count}/{expected_count})')
+
+        if ready_count >= expected_count:
+            return True
+
+        time.sleep(POLL_INTERVAL)
+
+    return False
+
+
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Run instances for the given cluster."""
+    del cluster_name  # unused - we use cluster_name_on_cloud
+    logger.info(f'Running instances for cluster {cluster_name_on_cloud} '
+                f'in region {region}')
+    logger.debug(f'DEBUG: region type={type(region)}, value={region!r}')
+    logger.debug(f'DEBUG: config node_config={config.node_config}')
+
+    # Check existing instances
+    existing_instances = _get_cluster_instances(cluster_name_on_cloud)
+    head_instance_id = _get_head_instance_id(existing_instances)
+
+    # Filter active instances
+    active_instances = {
+        iid: inst
+        for iid, inst in existing_instances.items()
+        if inst.get('status') == 'active'
+    }
+
+    current_count = len(active_instances)
+    target_count = config.count
+
+    logger.info(f'Current instances: {current_count}, target: {target_count}')
+
+    if current_count >= target_count:
+        if head_instance_id is None:
+            raise RuntimeError(
+                f'Cluster {cluster_name_on_cloud} has no head node')
+        logger.info(f'Cluster already has {current_count} instances, '
+                    f'no need to start more')
+        return common.ProvisionRecord(
+            provider_name='shadeform',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=None,  # Shadeform doesn't use separate zones
+            head_instance_id=head_instance_id,
+            resumed_instance_ids=[],
+            created_instance_ids=[])
+
+    # Create new instances
+    to_create = target_count - current_count
+    created_instance_ids = []
+
+    for _ in range(to_create):
+        node_type = 'head' if head_instance_id is None else 'worker'
+        instance_name = f'{cluster_name_on_cloud}-{node_type}'
+
+        # Extract configuration from node_config
+
+        # The node_config contains instance specs including InstanceType
+        # which follows the format: {cloud_provider}_{instance_type}
+        # (e.g., "massedcompute_A6000_basex2")
+        node_config = config.node_config
+        assert 'InstanceType' in node_config, \
+            'InstanceType must be present in node_config'
+
+        # Parse the instance type to extract cloud provider and instance specs
+        # Expected format: "{cloud}_{instance_type}" where cloud is provider
+        # (massedcompute, scaleway, lambda, etc.)
+        instance_type_full = node_config['InstanceType']
+        assert (isinstance(instance_type_full, str) and
+                '_' in instance_type_full), \
+            f'InstanceType must be in format cloud_instance_type, got: ' \
+            f'{instance_type_full}'
+
+        instance_type_split = instance_type_full.split('_')
+        assert len(instance_type_split) >= 2, \
+            f'InstanceType must contain at least one underscore, got: ' \
+            f'{instance_type_full}'
+
+        # Extract cloud provider (first part) and instance type (remaining)
+        # Example: "massedcompute_A6000-basex2" -> cloud="massedcompute",
+        # instance_type="A6000-basex2"
+        cloud = instance_type_split[0]
+        instance_type = '_'.join(instance_type_split[1:])
+
+        # Shadeform uses underscores instead of hyphens
+        instance_type = instance_type.replace('-', '_')
+
+        if instance_type.endswith('B'):
+            instance_type = instance_type[:-1]
+
+        # Replace "GBx" with "Gx" (case sensitive)
+        if 'GBx' in instance_type:
+            instance_type = instance_type.replace('GBx', 'Gx')
+
+        assert cloud, 'Cloud provider cannot be empty'
+        assert instance_type, 'Instance type cannot be empty'
+
+        # Get SSH key ID for authentication - this is optional and may be None
+        ssh_key_id = config.authentication_config.get('ssh_key_id')
+
+        create_config = {
+            'cloud': cloud,
+            'region': region,
+            'shade_instance_type': instance_type,
+            'name': instance_name,
+            'ssh_key_id': ssh_key_id
+        }
+
+        try:
+            logger.info(f'Creating {node_type} instance: {instance_name}')
+            response = shadeform_utils.create_instance(create_config)
+            instance_id = response['id']
+            created_instance_ids.append(instance_id)
+
+            if head_instance_id is None:
+                head_instance_id = instance_id
+
+            logger.info(f'Created instance {instance_id} ({node_type})')
+
+        except Exception as e:
+            logger.error(f'Failed to create instance: {e}')
+            # Clean up any created instances
+            for iid in created_instance_ids:
+                try:
+                    shadeform_utils.delete_instance(iid)
+                except requests.exceptions.RequestException as cleanup_e:
+                    logger.warning(
+                        f'Failed to cleanup instance {iid}: {cleanup_e}')
+            raise
+
+    # Wait for all instances to be ready
+    logger.info('Waiting for instances to become ready...')
+    if not _wait_for_instances_ready(cluster_name_on_cloud, target_count):
+        raise RuntimeError('Timed out waiting for instances to be ready')
+
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+
+    return common.ProvisionRecord(provider_name='shadeform',
+                                  cluster_name=cluster_name_on_cloud,
+                                  region=region,
+                                  zone=region,
+                                  head_instance_id=head_instance_id,
+                                  resumed_instance_ids=[],
+                                  created_instance_ids=created_instance_ids)
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    """Wait for instances to reach the specified state."""
+    del region, cluster_name_on_cloud, state  # unused
+    # For Shadeform, instances are ready when they reach 'active' status
+    # This is already handled in run_instances
+
+
+def stop_instances(cluster_name_on_cloud: str,
+                   provider_config: Optional[Dict[str, Any]] = None,
+                   worker_only: bool = False) -> None:
+    """Stop instances (not supported by Shadeform)."""
+    del cluster_name_on_cloud, provider_config, worker_only  # unused
+    raise NotImplementedError(
+        'Stopping instances is not supported by Shadeform')
+
+
+def terminate_instances(cluster_name_on_cloud: str,
+                        provider_config: Optional[Dict[str, Any]] = None,
+                        worker_only: bool = False) -> None:
+    """Terminate instances."""
+    del provider_config  # unused
+    logger.info(f'Terminating instances for cluster {cluster_name_on_cloud}')
+
+    instances = _get_cluster_instances(cluster_name_on_cloud)
+
+    if not instances:
+        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
+        return
+
+    instances_to_delete = instances
+    if worker_only:
+        # Only delete worker nodes, not head
+        instances_to_delete = {
+            iid: inst
+            for iid, inst in instances.items()
+            if not inst.get('name', '').endswith('-head')
+        }
+
+    for instance_id, instance in instances_to_delete.items():
+        try:
+            logger.info(
+                f'Terminating instance {instance_id} ({instance.get("name")})')
+            shadeform_utils.delete_instance(instance_id)
+        except requests.exceptions.RequestException as e:
+            logger.warning(f'Failed to terminate instance {instance_id}: {e}')
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    """Get cluster information."""
+    del region, provider_config  # unused
+    instances = _get_cluster_instances(cluster_name_on_cloud)
+
+    if not instances:
+        return common.ClusterInfo(instances={},
+                                  head_instance_id=None,
+                                  provider_name='shadeform')
+
+    head_instance_id = _get_head_instance_id(instances)
+
+    # Convert instance format for ClusterInfo
+    cluster_instances = {}
+    for instance_id, instance in instances.items():
+        instance_info = common.InstanceInfo(
+            instance_id=instance_id,
+            internal_ip=instance.get('ip', ''),
+            external_ip=instance.get('ip', ''),
+            ssh_port=instance.get('ssh_port', 22),
+            tags={},
+        )
+        # ClusterInfo expects Dict[InstanceId, List[InstanceInfo]]
+        cluster_instances[instance_id] = [instance_info]
+
+    ssh_user = 'shadeform'  # default
+    if head_instance_id is not None:
+        ssh_user = instances.get(head_instance_id,
+                                 {}).get('ssh_user', 'shadeform')
+
+    return common.ClusterInfo(instances=cluster_instances,
+                              head_instance_id=head_instance_id,
+                              provider_name='shadeform',
+                              ssh_user=ssh_user)
+
+
+def query_instances(
+    cluster_name: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    """Query the status of instances."""
+    del cluster_name, provider_config  # unused
+    instances = _get_cluster_instances(cluster_name_on_cloud)
+
+    if not instances:
+        return {}
+
+    status_map: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                                Optional[str]]] = {}
+    for instance_id, instance in instances.items():
+        shadeform_status = instance.get('status', 'unknown')
+        sky_status = SHADEFORM_STATUS_MAP.get(shadeform_status,
+                                              status_lib.ClusterStatus.INIT)
+
+        if (non_terminated_only and
+                sky_status == status_lib.ClusterStatus.STOPPED):
+            continue
+
+        status_map[instance_id] = (sky_status, None)
+
+    return status_map
+
+
+def open_ports(cluster_name_on_cloud: str,
+               ports: List[str],
+               provider_config: Optional[Dict[str, Any]] = None) -> None:
+    """Open ports (not supported by Shadeform)."""
+    del cluster_name_on_cloud, ports, provider_config  # unused
+    raise NotImplementedError()
+
+
+def cleanup_ports(cluster_name_on_cloud: str,
+                  ports: List[str],
+                  provider_config: Optional[Dict[str, Any]] = None) -> None:
+    """Cleanup ports (not supported by Shadeform)."""
+    del cluster_name_on_cloud, ports, provider_config  # unused
+    # Nothing to cleanup since we don't support dynamic port opening
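Note: the InstanceType parsing in run_instances() above packs the underlying provider and the Shadeform instance type into one string. A minimal sketch of that normalization, separate from the diff (the helper name and example values are hypothetical):

    def _parse_instance_type(instance_type_full: str):
        # Split '{cloud}_{instance_type}' on the first underscore only.
        cloud, instance_type = instance_type_full.split('_', 1)
        # Shadeform uses underscores instead of hyphens.
        instance_type = instance_type.replace('-', '_')
        # Strip a trailing 'B' and shorten 'GBx' to 'Gx', as the diff does.
        if instance_type.endswith('B'):
            instance_type = instance_type[:-1]
        return cloud, instance_type.replace('GBx', 'Gx')

    assert _parse_instance_type('massedcompute_A6000_basex2') == (
        'massedcompute', 'A6000_basex2')
    assert _parse_instance_type('lambda_A100-80GBx8') == ('lambda', 'A100_80Gx8')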
sky/provision/shadeform/shadeform_utils.py
ADDED

@@ -0,0 +1,83 @@
+"""Shadeform API utilities."""
+
+import os
+from typing import Any, Dict
+
+from sky.adaptors import common
+
+# Lazy import to avoid dependency on external packages
+requests = common.LazyImport('requests')
+
+# Shadeform API configuration
+SHADEFORM_API_BASE = 'https://api.shadeform.ai/v1'
+SHADEFORM_API_KEY_PATH = '~/.shadeform/api_key'
+
+
+def get_api_key() -> str:
+    """Get Shadeform API key from file."""
+    api_key_path = os.path.expanduser(SHADEFORM_API_KEY_PATH)
+    if not os.path.exists(api_key_path):
+        raise FileNotFoundError(
+            f'Shadeform API key not found at {api_key_path}. '
+            'Please save your API key to this file.')
+
+    with open(api_key_path, 'r', encoding='utf-8') as f:
+        api_key = f.read().strip()
+
+    if not api_key:
+        raise ValueError(f'Shadeform API key is empty in {api_key_path}')
+
+    return api_key
+
+
+def make_request(method: str, endpoint: str, **kwargs) -> Any:
+    """Make a request to the Shadeform API."""
+    url = f'{SHADEFORM_API_BASE}/{endpoint.lstrip("/")}'
+    headers = {
+        'X-API-KEY': get_api_key(),
+        'Content-Type': 'application/json',
+    }
+
+    response = requests.request(method, url, headers=headers, **kwargs)
+    response.raise_for_status()
+
+    # Some APIs (like delete) return empty responses with just 200 status
+    if response.text.strip():
+        return response.json()
+    else:
+        # Return empty dict for empty responses (e.g., delete operations)
+        return {}
+
+
+def get_instances() -> Dict[str, Any]:
+    """Get all instances."""
+    return make_request('GET', '/instances')
+
+
+def get_instance_info(instance_id: str) -> Dict[str, Any]:
+    """Get information about a specific instance."""
+    return make_request('GET', f'/instances/{instance_id}/info')
+
+
+def create_instance(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Create a new instance."""
+    return make_request('POST', '/instances/create', json=config)
+
+
+def delete_instance(instance_id: str) -> Dict[str, Any]:
+    """Delete an instance.
+
+    Note: Shadeform delete API returns empty response with 200 status.
+    """
+    return make_request('POST', f'/instances/{instance_id}/delete')
+
+
+def get_ssh_keys() -> Dict[str, Any]:
+    """Get all SSH keys."""
+    return make_request('GET', '/sshkeys')
+
+
+def add_ssh_key(name: str, public_key: str) -> Dict[str, Any]:
+    """Add a new SSH key."""
+    config = {'name': name, 'public_key': public_key}
+    return make_request('POST', '/sshkeys/add', json=config)
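Note: a hedged usage sketch of the helpers above (assumes a valid key in ~/.shadeform/api_key and network access; region and name values are illustrative). The 'instances' and 'id' response fields are the ones the provisioning code above already relies on:

    from sky.provision.shadeform import shadeform_utils

    # List current instances; the API nests them under an 'instances' key.
    instances = shadeform_utils.get_instances().get('instances', [])

    # Create one instance; run_instances() above reads 'id' from the response.
    created = shadeform_utils.create_instance({
        'cloud': 'massedcompute',
        'region': 'us-east-1',  # hypothetical region name
        'shade_instance_type': 'A6000_basex2',
        'name': 'my-cluster-head',
        'ssh_key_id': None,
    })
    print(shadeform_utils.get_instance_info(created['id']))

    # The delete endpoint returns an empty body on success.
    shadeform_utils.delete_instance(created['id'])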
sky/server/common.py
CHANGED

@@ -554,8 +554,8 @@ def _start_api_server(deploy: bool = False,
     # pylint: disable=import-outside-toplevel
     import sky.jobs.utils as job_utils
     max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
-                  if job_utils.is_consolidation_mode() else
-                  server_constants.MIN_AVAIL_MEM_GB)
+                  if job_utils.is_consolidation_mode(on_api_restart=True)
+                  else server_constants.MIN_AVAIL_MEM_GB)
     if avail_mem_size_gb <= max_memory:
         logger.warning(
             f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '

@@ -571,6 +571,8 @@ def _start_api_server(deploy: bool = False,
     args += [f'--host={host}']
     if metrics_port is not None:
         args += [f'--metrics-port={metrics_port}']
+    # Use this argument to disable the internal signal file check.
+    args += ['--start-with-python']

     if foreground:
         # Replaces the current process with the API server
sky/server/requests/executor.py
CHANGED

@@ -81,6 +81,26 @@ logger = sky_logging.init_logger(__name__)
 # platforms, including macOS.
 multiprocessing.set_start_method('spawn', force=True)

+# Max threads that is equivalent to the number of thread workers in the
+# default thread pool executor of event loop.
+_REQUEST_THREADS_LIMIT = min(32, (os.cpu_count() or 0) + 4)
+
+_REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
+# A dedicated thread pool executor for synced requests execution in coroutine
+_REQUEST_THREAD_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None
+
+
+def get_request_thread_executor() -> concurrent.futures.ThreadPoolExecutor:
+    """Lazy init and return the request thread executor for current process."""
+    global _REQUEST_THREAD_EXECUTOR
+    if _REQUEST_THREAD_EXECUTOR is not None:
+        return _REQUEST_THREAD_EXECUTOR
+    with _REQUEST_THREAD_EXECUTOR_LOCK:
+        if _REQUEST_THREAD_EXECUTOR is None:
+            _REQUEST_THREAD_EXECUTOR = concurrent.futures.ThreadPoolExecutor(
+                max_workers=_REQUEST_THREADS_LIMIT)
+    return _REQUEST_THREAD_EXECUTOR
+

 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.

@@ -404,6 +424,7 @@ def _request_execution_wrapper(request_id: str,
         os.close(original_stderr)
         original_stderr = None

+    request_name = None
     try:
         # As soon as the request is updated with the executor PID, we can
         # receive SIGTERM from cancellation. So, we update the request inside

@@ -495,7 +516,8 @@ def _request_execution_wrapper(request_id: str,
         annotations.clear_request_level_cache()
         with metrics_utils.time_it(name='release_memory', group='internal'):
             common_utils.release_memory()
-
+        if request_name is not None:
+            _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
     except Exception as e:  # pylint: disable=broad-except
         logger.error(f'Failed to record memory metrics: '
                      f'{common_utils.format_exception(e)}')

@@ -576,8 +598,8 @@ async def _execute_request_coroutine(request: api_requests.Request):
     # 1. skypilot config is not contextual
     # 2. envs that read directly from os.environ are not contextual
     ctx.override_envs(request_body.env_vars)
-    fut: asyncio.Future = context_utils.to_thread(
-        func, **request_body.to_kwargs())
+    fut: asyncio.Future = context_utils.to_thread_with_executor(
+        get_request_thread_executor(), func, **request_body.to_kwargs())

     async def poll_task(request_id: str) -> bool:
         req_status = await api_requests.get_request_status_async(request_id)
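Note: get_request_thread_executor() above lazily creates one shared pool with double-checked locking, and _execute_request_coroutine() dispatches blocking work onto it. A minimal sketch of the underlying pattern with plain asyncio (no SkyPilot context propagation, which is presumably what to_thread_with_executor adds on top):

    import asyncio
    import concurrent.futures

    async def offload(executor: concurrent.futures.ThreadPoolExecutor,
                      func, *args):
        # Run the blocking call in the pool without blocking the event loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(executor, func, *args)

    async def main():
        pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)
        print(await offload(pool, sum, range(10)))  # 45

    asyncio.run(main())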
sky/server/server.py
CHANGED

@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import base64
+from concurrent.futures import ThreadPoolExecutor
 import contextlib
 import datetime
 import hashlib

@@ -1731,9 +1732,9 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')

     # Run core.status in another thread to avoid blocking the event loop.
-
-
-
+    with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
+        cluster_records = await context_utils.to_thread_with_executor(
+            thread_pool_executor, core.status, cluster_name, all_users=True)
     cluster_record = cluster_records[0]
     if cluster_record['status'] != status_lib.ClusterStatus.UP:
         raise fastapi.HTTPException(

@@ -1967,6 +1968,7 @@ if __name__ == '__main__':
     # Serve metrics on a separate port to isolate it from the application APIs:
     # metrics port will not be exposed to the public network typically.
     parser.add_argument('--metrics-port', default=9090, type=int)
+    parser.add_argument('--start-with-python', action='store_true')
     cmd_args = parser.parse_args()
     if cmd_args.port == cmd_args.metrics_port:
         logger.error('port and metrics-port cannot be the same, exiting.')

@@ -1981,6 +1983,10 @@ if __name__ == '__main__':
         logger.error(f'Port {cmd_args.port} is not available, exiting.')
         raise RuntimeError(f'Port {cmd_args.port} is not available')

+    if not cmd_args.start_with_python:
+        # Maybe touch the signal file on API server startup.
+        managed_job_utils.is_consolidation_mode(on_api_restart=True)
+
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
sky/setup_files/dependencies.py
CHANGED

@@ -222,6 +222,7 @@ extras_require: Dict[str, List[str]] = {
     'hyperbolic': [],  # No dependencies needed for hyperbolic
     'seeweb': ['ecsapi>=0.2.0'],
     'server': server_dependencies,
+    'shadeform': [],  # No dependencies needed for shadeform
 }

 # Calculate which clouds should be included in the [all] installation.
sky/sky_logging.py
CHANGED

@@ -109,7 +109,6 @@ def _setup_logger():
     global _default_handler
     if _default_handler is None:
         _default_handler = EnvAwareHandler(sys.stdout)
-        _default_handler.flush = sys.stdout.flush  # type: ignore
     if env_options.Options.SHOW_DEBUG_INFO.get():
         _default_handler.setLevel(logging.DEBUG)
     else:

@@ -129,7 +128,6 @@ def _setup_logger():
     for logger_name in _SENSITIVE_LOGGER:
         logger = logging.getLogger(logger_name)
         handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
-        handler_to_logger.flush = sys.stdout.flush  # type: ignore
         logger.addHandler(handler_to_logger)
         logger.setLevel(logging.INFO)
         if _show_logging_prefix():
sky/skylet/constants.py
CHANGED

@@ -226,7 +226,9 @@ RAY_INSTALLATION_COMMANDS = (
     f'{SKY_UV_PIP_CMD} list | grep "ray " | '
     f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
     f'|| {RAY_STATUS} || '
-
+    # The pydantic-core==2.41.3 for arm seems corrupted
+    # so we need to avoid that specific version.
+    f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; '  # pylint: disable=line-too-long
     # In some envs, e.g. pip does not have permission to write under /opt/conda
     # ray package will be installed under ~/.local/bin. If the user's PATH does
     # not include ~/.local/bin (the pip install will have the output: `WARNING:

@@ -402,10 +404,25 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
 ]
 # When overriding the SkyPilot configs on the API server with the client one,
 # we skip the following keys because they are meant to be client-side configs.
-
-
-
-
+# Also, we skip the consolidation mode config as those should be only set on
+# the API server side.
+SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
+    ('api_server',),
+    ('allowed_clouds',),
+    ('workspaces',),
+    ('db',),
+    ('daemons',),
+    # TODO(kevin,tian): Override the whole controller config once our test
+    # infrastructure supports setting dynamic server side configs.
+    # Tests that are affected:
+    # - test_managed_jobs_ha_kill_starting
+    # - test_managed_jobs_ha_kill_running
+    # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
+    #   LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
+    #   but the configs won't be applied)
+    ('jobs', 'controller', 'consolidation_mode'),
+    ('serve', 'controller', 'consolidation_mode'),
+]

 # Constants for Azure blob storage
 WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60

@@ -471,7 +488,7 @@ CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
               'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
-              'hyperbolic', 'seeweb')
+              'hyperbolic', 'seeweb', 'shadeform')
 # END constants used for service catalog.

 # The user ID of the SkyPilot system.
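Note: SKIPPED_CLIENT_OVERRIDE_KEYS stores nested config keys as tuple paths. A hypothetical illustration (not SkyPilot's actual override code) of how such tuples can gate client-side overrides by prefix match:

    from typing import List, Tuple

    SKIPPED: List[Tuple[str, ...]] = [
        ('api_server',),
        ('jobs', 'controller', 'consolidation_mode'),
    ]

    def is_skipped(key_path: Tuple[str, ...]) -> bool:
        # A key path is skipped if any configured tuple is a prefix of it.
        return any(key_path[:len(skip)] == skip for skip in SKIPPED)

    assert is_skipped(('api_server', 'endpoint'))
    assert is_skipped(('jobs', 'controller', 'consolidation_mode'))
    assert not is_skipped(('jobs', 'recovery'))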
sky/skylet/log_lib.py
CHANGED

@@ -271,7 +271,6 @@ def run_with_log(
         stdout, stderr = context_utils.pipe_and_wait_process(
             ctx,
             proc,
-            cancel_callback=subprocess_utils.kill_children_processes,
             stdout_stream_handler=stdout_stream_handler,
             stderr_stream_handler=stderr_stream_handler)
     elif process_stream: