skypilot-nightly 1.0.0.dev20251005__py3-none-any.whl → 1.0.0.dev20251008__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/authentication.py +17 -21
- sky/backends/backend.py +1 -3
- sky/backends/cloud_vm_ray_backend.py +8 -20
- sky/backends/local_docker_backend.py +0 -5
- sky/client/sdk.py +24 -23
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +1 -11
- sky/global_user_state.py +16 -5
- sky/jobs/constants.py +1 -7
- sky/jobs/controller.py +9 -1
- sky/jobs/scheduler.py +30 -15
- sky/jobs/server/core.py +8 -3
- sky/jobs/utils.py +30 -2
- sky/metrics/utils.py +62 -45
- sky/provision/instance_setup.py +32 -10
- sky/provision/kubernetes/utils.py +4 -1
- sky/provision/provisioner.py +10 -7
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/server/common.py +1 -0
- sky/server/config.py +2 -0
- sky/server/metrics.py +3 -1
- sky/server/requests/executor.py +103 -77
- sky/server/requests/requests.py +26 -11
- sky/server/server.py +16 -0
- sky/skylet/constants.py +9 -1
- sky/skylet/events.py +17 -0
- sky/skylet/skylet.py +3 -0
- sky/templates/kubernetes-ray.yml.j2 +5 -0
- sky/utils/context_utils.py +5 -1
- sky/utils/controller_utils.py +14 -0
- sky/utils/db/db_utils.py +2 -0
- sky/utils/db/migration_utils.py +11 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/METADATA +35 -35
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/RECORD +57 -56
- /sky/dashboard/out/_next/static/{Vg53Kzbf7u4o6fYPeOHMe → MnvNdzHHpiZG1_oKSpbxF}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{Vg53Kzbf7u4o6fYPeOHMe → MnvNdzHHpiZG1_oKSpbxF}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/top_level.txt +0 -0
sky/execution.py
CHANGED
|
@@ -112,7 +112,6 @@ def _execute(
|
|
|
112
112
|
stages: Optional[List[Stage]] = None,
|
|
113
113
|
cluster_name: Optional[str] = None,
|
|
114
114
|
detach_setup: bool = False,
|
|
115
|
-
detach_run: bool = False,
|
|
116
115
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
117
116
|
no_setup: bool = False,
|
|
118
117
|
clone_disk_from: Optional[str] = None,
|
|
@@ -157,8 +156,6 @@ def _execute(
|
|
|
157
156
|
job itself. You can safely ctrl-c to detach from logging, and it will
|
|
158
157
|
not interrupt the setup process. To see the logs again after detaching,
|
|
159
158
|
use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
|
|
160
|
-
detach_run: If True, as soon as a job is submitted, return from this
|
|
161
|
-
function and do not stream execution logs.
|
|
162
159
|
idle_minutes_to_autostop: int; if provided, the cluster will be set to
|
|
163
160
|
autostop after this many minutes of idleness.
|
|
164
161
|
no_setup: bool; whether to skip setup commands or not when (re-)launching.
|
|
@@ -217,7 +214,6 @@ def _execute(
|
|
|
217
214
|
stages=stages,
|
|
218
215
|
cluster_name=cluster_name,
|
|
219
216
|
detach_setup=detach_setup,
|
|
220
|
-
detach_run=detach_run,
|
|
221
217
|
no_setup=no_setup,
|
|
222
218
|
clone_disk_from=clone_disk_from,
|
|
223
219
|
skip_unnecessary_provisioning=skip_unnecessary_provisioning,
|
|
@@ -239,7 +235,6 @@ def _execute_dag(
|
|
|
239
235
|
stages: Optional[List[Stage]],
|
|
240
236
|
cluster_name: Optional[str],
|
|
241
237
|
detach_setup: bool,
|
|
242
|
-
detach_run: bool,
|
|
243
238
|
no_setup: bool,
|
|
244
239
|
clone_disk_from: Optional[str],
|
|
245
240
|
skip_unnecessary_provisioning: bool,
|
|
@@ -507,10 +502,7 @@ def _execute_dag(
|
|
|
507
502
|
if Stage.EXEC in stages:
|
|
508
503
|
try:
|
|
509
504
|
global_user_state.update_last_use(handle.get_cluster_name())
|
|
510
|
-
job_id = backend.execute(handle,
|
|
511
|
-
task,
|
|
512
|
-
detach_run,
|
|
513
|
-
dryrun=dryrun)
|
|
505
|
+
job_id = backend.execute(handle, task, dryrun=dryrun)
|
|
514
506
|
finally:
|
|
515
507
|
# Enables post_execute() to be run after KeyboardInterrupt.
|
|
516
508
|
backend.post_execute(handle, down)
|
|
@@ -707,7 +699,6 @@ def launch(
|
|
|
707
699
|
stages=stages,
|
|
708
700
|
cluster_name=cluster_name,
|
|
709
701
|
detach_setup=detach_setup,
|
|
710
|
-
detach_run=True,
|
|
711
702
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
712
703
|
no_setup=no_setup,
|
|
713
704
|
clone_disk_from=clone_disk_from,
|
|
@@ -802,6 +793,5 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
802
793
|
Stage.EXEC,
|
|
803
794
|
],
|
|
804
795
|
cluster_name=cluster_name,
|
|
805
|
-
detach_run=True,
|
|
806
796
|
job_logger=job_logger,
|
|
807
797
|
)
|
sky/global_user_state.py
CHANGED
|
@@ -2495,11 +2495,22 @@ def _set_cluster_yaml_from_file(cluster_yaml_path: str,
|
|
|
2495
2495
|
# on the local file system and migrate it to the database.
|
|
2496
2496
|
# TODO(syang): remove this check once we have a way to migrate the
|
|
2497
2497
|
# cluster from file to database. Remove on v0.12.0.
|
|
2498
|
-
if cluster_yaml_path is not None
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
|
|
2498
|
+
if cluster_yaml_path is not None:
|
|
2499
|
+
# First try the exact path
|
|
2500
|
+
path_to_read = None
|
|
2501
|
+
if os.path.exists(cluster_yaml_path):
|
|
2502
|
+
path_to_read = cluster_yaml_path
|
|
2503
|
+
# Fallback: try with .debug suffix (when debug logging was enabled)
|
|
2504
|
+
# Debug logging causes YAML files to be saved with .debug suffix
|
|
2505
|
+
# but the path stored in the handle doesn't include it
|
|
2506
|
+
debug_path = cluster_yaml_path + '.debug'
|
|
2507
|
+
if os.path.exists(debug_path):
|
|
2508
|
+
path_to_read = debug_path
|
|
2509
|
+
if path_to_read is not None:
|
|
2510
|
+
with open(path_to_read, 'r', encoding='utf-8') as f:
|
|
2511
|
+
yaml_str = f.read()
|
|
2512
|
+
set_cluster_yaml(cluster_name, yaml_str)
|
|
2513
|
+
return yaml_str
|
|
2503
2514
|
return None
|
|
2504
2515
|
|
|
2505
2516
|
|
sky/jobs/constants.py
CHANGED
|
@@ -15,16 +15,10 @@ JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
|
|
|
15
15
|
CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
|
|
16
16
|
SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
|
|
17
17
|
# Resources as a dict for the jobs controller.
|
|
18
|
-
# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
|
|
19
|
-
# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
|
|
20
|
-
# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
|
|
21
|
-
# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
|
|
22
|
-
# parallelism limit, and memory / 350MB is the limit to concurrently running
|
|
23
|
-
# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
|
|
24
18
|
# We use 50 GB disk size to reduce the cost.
|
|
25
19
|
CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
|
|
26
20
|
'cpus': '4+',
|
|
27
|
-
'memory': '
|
|
21
|
+
'memory': '4x',
|
|
28
22
|
'disk_size': 50
|
|
29
23
|
}
|
|
30
24
|
|
sky/jobs/controller.py
CHANGED
|
@@ -1144,7 +1144,15 @@ class Controller:
|
|
|
1144
1144
|
await asyncio.sleep(30)
|
|
1145
1145
|
continue
|
|
1146
1146
|
|
|
1147
|
-
if
|
|
1147
|
+
# Normally, 200 jobs can run on each controller. But if we have a
|
|
1148
|
+
# ton of controllers, we need to limit the number of jobs that can
|
|
1149
|
+
# run on each controller, to achieve a total of 2000 jobs across all
|
|
1150
|
+
# controllers.
|
|
1151
|
+
max_jobs = min(scheduler.MAX_JOBS_PER_WORKER,
|
|
1152
|
+
(scheduler.MAX_TOTAL_RUNNING_JOBS //
|
|
1153
|
+
scheduler.get_number_of_controllers()))
|
|
1154
|
+
|
|
1155
|
+
if len(running_tasks) >= max_jobs:
|
|
1148
1156
|
await asyncio.sleep(60)
|
|
1149
1157
|
continue
|
|
1150
1158
|
|
sky/jobs/scheduler.py
CHANGED
|
@@ -63,7 +63,9 @@ from sky.jobs import state
|
|
|
63
63
|
from sky.jobs import utils as managed_job_utils
|
|
64
64
|
from sky.server import config as server_config
|
|
65
65
|
from sky.skylet import constants
|
|
66
|
+
from sky.utils import annotations
|
|
66
67
|
from sky.utils import common_utils
|
|
68
|
+
from sky.utils import controller_utils
|
|
67
69
|
from sky.utils import subprocess_utils
|
|
68
70
|
|
|
69
71
|
if typing.TYPE_CHECKING:
|
|
@@ -91,20 +93,29 @@ JOB_MEMORY_MB = 400
|
|
|
91
93
|
LAUNCHES_PER_WORKER = 8
|
|
92
94
|
# this can probably be increased to around 300-400 but keeping it lower just
|
|
93
95
|
# to be safe
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
96
|
+
MAX_JOBS_PER_WORKER = 200
|
|
97
|
+
# Maximum number of controllers that can be running. Hard to handle more than
|
|
98
|
+
# 512 launches at once.
|
|
99
|
+
MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
|
|
100
|
+
# Limit the number of jobs that can be running at once on the entire jobs
|
|
101
|
+
# controller cluster. It's hard to handle cancellation of more than 2000 jobs at
|
|
102
|
+
# once.
|
|
103
|
+
# TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
|
|
104
|
+
# hardcoded max limit.
|
|
105
|
+
MAX_TOTAL_RUNNING_JOBS = 2000
|
|
101
106
|
# Maximum values for above constants. There will start to be lagging issues
|
|
102
107
|
# at these numbers already.
|
|
103
108
|
# JOB_MEMORY_MB = 200
|
|
104
109
|
# LAUNCHES_PER_WORKER = 16
|
|
105
110
|
# JOBS_PER_WORKER = 400
|
|
106
111
|
|
|
112
|
+
# keep 2GB reserved after the controllers
|
|
113
|
+
MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
|
|
114
|
+
|
|
115
|
+
CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
|
|
116
|
+
|
|
107
117
|
|
|
118
|
+
@annotations.lru_cache(scope='global')
|
|
108
119
|
def get_number_of_controllers() -> int:
|
|
109
120
|
"""Returns the number of controllers that should be running.
|
|
110
121
|
|
|
@@ -123,7 +134,7 @@ def get_number_of_controllers() -> int:
|
|
|
123
134
|
consolidation_mode = skypilot_config.get_nested(
|
|
124
135
|
('jobs', 'controller', 'consolidation_mode'), default_value=False)
|
|
125
136
|
|
|
126
|
-
total_memory_mb =
|
|
137
|
+
total_memory_mb = controller_utils.get_controller_mem_size_gb() * 1024
|
|
127
138
|
if consolidation_mode:
|
|
128
139
|
config = server_config.compute_server_config(deploy=True, quiet=True)
|
|
129
140
|
|
|
@@ -136,13 +147,16 @@ def get_number_of_controllers() -> int:
|
|
|
136
147
|
config.short_worker_config.burstable_parallelism) * \
|
|
137
148
|
server_config.SHORT_WORKER_MEM_GB * 1024
|
|
138
149
|
|
|
139
|
-
return
|
|
150
|
+
return min(MAX_CONTROLLERS,
|
|
151
|
+
max(1, int((total_memory_mb - used) // JOB_MEMORY_MB)))
|
|
140
152
|
else:
|
|
141
|
-
return
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
153
|
+
return min(
|
|
154
|
+
MAX_CONTROLLERS,
|
|
155
|
+
max(
|
|
156
|
+
1,
|
|
157
|
+
int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
|
|
158
|
+
((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) *
|
|
159
|
+
1024 + JOB_MEMORY_MB))))
|
|
146
160
|
|
|
147
161
|
|
|
148
162
|
def start_controller() -> None:
|
|
@@ -280,7 +294,8 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
|
|
|
280
294
|
common_utils.get_user_hash(), priority)
|
|
281
295
|
if state.get_ha_recovery_script(job_id) is None:
|
|
282
296
|
# the run command is just the command that called scheduler
|
|
283
|
-
run = (f'{
|
|
297
|
+
run = (f'source {env_file_path} && '
|
|
298
|
+
f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
|
|
284
299
|
f'--job-id {job_id} --env-file {env_file_path} '
|
|
285
300
|
f'--user-yaml-path {original_user_yaml_path} '
|
|
286
301
|
f'--priority {priority}')
|
sky/jobs/server/core.py
CHANGED
|
@@ -407,9 +407,12 @@ def launch(
|
|
|
407
407
|
job_identity = ''
|
|
408
408
|
if job_rank is not None:
|
|
409
409
|
job_identity = f' (rank: {job_rank})'
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
410
|
+
job_controller_postfix = (' from jobs controller' if
|
|
411
|
+
consolidation_mode_job_id is None else '')
|
|
412
|
+
logger.info(
|
|
413
|
+
f'{colorama.Fore.YELLOW}'
|
|
414
|
+
f'Launching managed job {dag.name!r}{job_identity}'
|
|
415
|
+
f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')
|
|
413
416
|
|
|
414
417
|
# Launch with the api server's user hash, so that sky status does
|
|
415
418
|
# not show the owner of the controller as whatever user launched
|
|
@@ -456,6 +459,8 @@ def launch(
|
|
|
456
459
|
managed_job_state.set_ha_recovery_script(
|
|
457
460
|
consolidation_mode_job_id, run_script)
|
|
458
461
|
backend.run_on_head(local_handle, run_script)
|
|
462
|
+
ux_utils.starting_message(
|
|
463
|
+
f'Job submitted, ID: {consolidation_mode_job_id}')
|
|
459
464
|
return consolidation_mode_job_id, local_handle
|
|
460
465
|
|
|
461
466
|
if pool is None:
|
sky/jobs/utils.py
CHANGED
|
@@ -11,6 +11,7 @@ import enum
|
|
|
11
11
|
import logging
|
|
12
12
|
import os
|
|
13
13
|
import pathlib
|
|
14
|
+
import re
|
|
14
15
|
import shlex
|
|
15
16
|
import textwrap
|
|
16
17
|
import time
|
|
@@ -299,8 +300,10 @@ async def get_job_status(
|
|
|
299
300
|
job_logger.info(f'Job status: {status}')
|
|
300
301
|
job_logger.info('=' * 34)
|
|
301
302
|
return status
|
|
302
|
-
except (exceptions.CommandError, grpc.RpcError,
|
|
303
|
-
|
|
303
|
+
except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
|
|
304
|
+
ValueError, TypeError) as e:
|
|
305
|
+
# Note: Each of these exceptions has some additional conditions to
|
|
306
|
+
# limit how we handle it and whether or not we catch it.
|
|
304
307
|
# Retry on k8s transient network errors. This is useful when using
|
|
305
308
|
# coreweave which may have transient network issue sometimes.
|
|
306
309
|
is_transient_error = False
|
|
@@ -319,6 +322,31 @@ async def get_job_status(
|
|
|
319
322
|
is_transient_error = True
|
|
320
323
|
elif isinstance(e, grpc.FutureTimeoutError):
|
|
321
324
|
detailed_reason = 'Timeout'
|
|
325
|
+
# TODO(cooperc): Gracefully handle these exceptions in the backend.
|
|
326
|
+
elif isinstance(e, ValueError):
|
|
327
|
+
# If the cluster yaml is deleted in the middle of getting the
|
|
328
|
+
# SSH credentials, we could see this. See
|
|
329
|
+
# sky/global_user_state.py get_cluster_yaml_dict.
|
|
330
|
+
if re.search(r'Cluster yaml .* not found', str(e)):
|
|
331
|
+
detailed_reason = 'Cluster yaml was deleted'
|
|
332
|
+
else:
|
|
333
|
+
raise
|
|
334
|
+
elif isinstance(e, TypeError):
|
|
335
|
+
# We will grab the SSH credentials from the cluster yaml, but if
|
|
336
|
+
# handle.cluster_yaml is None, we will just return an empty dict
|
|
337
|
+
# for the credentials. See
|
|
338
|
+
# backend_utils.ssh_credential_from_yaml. Then, the credentials
|
|
339
|
+
# are passed as kwargs to SSHCommandRunner.__init__ - see
|
|
340
|
+
# cloud_vm_ray_backend.get_command_runners. So we can hit this
|
|
341
|
+
# TypeError if the cluster yaml is removed from the handle right
|
|
342
|
+
# when we pull it before the cluster is fully deleted.
|
|
343
|
+
error_msg_to_check = (
|
|
344
|
+
'SSHCommandRunner.__init__() missing 2 required positional '
|
|
345
|
+
'arguments: \'ssh_user\' and \'ssh_private_key\'')
|
|
346
|
+
if str(e) == error_msg_to_check:
|
|
347
|
+
detailed_reason = 'SSH credentials were already cleaned up'
|
|
348
|
+
else:
|
|
349
|
+
raise
|
|
322
350
|
if is_transient_error:
|
|
323
351
|
logger.info('Failed to connect to the cluster. Retrying '
|
|
324
352
|
f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
|
sky/metrics/utils.py
CHANGED
|
@@ -11,7 +11,9 @@ from typing import List, Optional, Tuple
|
|
|
11
11
|
import httpx
|
|
12
12
|
import prometheus_client as prom
|
|
13
13
|
|
|
14
|
+
from sky import sky_logging
|
|
14
15
|
from sky.skylet import constants
|
|
16
|
+
from sky.utils import common_utils
|
|
15
17
|
from sky.utils import context_utils
|
|
16
18
|
|
|
17
19
|
_SELECT_TIMEOUT = 1
|
|
@@ -35,6 +37,8 @@ _MEM_BUCKETS = [
|
|
|
35
37
|
float('inf'),
|
|
36
38
|
]
|
|
37
39
|
|
|
40
|
+
logger = sky_logging.init_logger(__name__)
|
|
41
|
+
|
|
38
42
|
# Whether the metrics are enabled, cannot be changed at runtime.
|
|
39
43
|
METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
|
|
40
44
|
'false').lower() == 'true'
|
|
@@ -188,53 +192,61 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
188
192
|
if 'KUBECONFIG' not in env:
|
|
189
193
|
env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
|
|
190
194
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
stdout=subprocess.PIPE,
|
|
194
|
-
stderr=subprocess.STDOUT,
|
|
195
|
-
text=True,
|
|
196
|
-
env=env)
|
|
197
|
-
|
|
195
|
+
port_forward_process = None
|
|
196
|
+
port_forward_exit = False
|
|
198
197
|
local_port = None
|
|
199
|
-
start_time = time.time()
|
|
200
|
-
|
|
201
|
-
buffer = ''
|
|
202
|
-
# wait for the port forward to start and extract the local port
|
|
203
|
-
while time.time() - start_time < start_port_forward_timeout:
|
|
204
|
-
if port_forward_process.poll() is not None:
|
|
205
|
-
# port forward process has terminated
|
|
206
|
-
if port_forward_process.returncode != 0:
|
|
207
|
-
raise RuntimeError(
|
|
208
|
-
f'Port forward failed for service {service} in namespace '
|
|
209
|
-
f'{namespace} on context {context}')
|
|
210
|
-
break
|
|
211
|
-
|
|
212
|
-
# read output line by line to find the local port
|
|
213
|
-
if port_forward_process.stdout:
|
|
214
|
-
# Wait up to 1s for data to be available without blocking
|
|
215
|
-
r, _, _ = select.select([port_forward_process.stdout], [], [],
|
|
216
|
-
_SELECT_TIMEOUT)
|
|
217
|
-
if r:
|
|
218
|
-
# Read available bytes from the FD without blocking
|
|
219
|
-
fd = port_forward_process.stdout.fileno()
|
|
220
|
-
raw = os.read(fd, _SELECT_BUFFER_SIZE)
|
|
221
|
-
chunk = raw.decode(errors='ignore')
|
|
222
|
-
buffer += chunk
|
|
223
|
-
match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
|
|
224
|
-
if match:
|
|
225
|
-
local_port = int(match.group(1))
|
|
226
|
-
break
|
|
227
|
-
|
|
228
|
-
# sleep for 100ms to avoid busy-waiting
|
|
229
|
-
time.sleep(0.1)
|
|
230
198
|
|
|
199
|
+
try:
|
|
200
|
+
# start the port forward process
|
|
201
|
+
port_forward_process = subprocess.Popen(cmd,
|
|
202
|
+
stdout=subprocess.PIPE,
|
|
203
|
+
stderr=subprocess.STDOUT,
|
|
204
|
+
text=True,
|
|
205
|
+
env=env)
|
|
206
|
+
|
|
207
|
+
start_time = time.time()
|
|
208
|
+
|
|
209
|
+
buffer = ''
|
|
210
|
+
# wait for the port forward to start and extract the local port
|
|
211
|
+
while time.time() - start_time < start_port_forward_timeout:
|
|
212
|
+
if port_forward_process.poll() is not None:
|
|
213
|
+
# port forward process has terminated
|
|
214
|
+
if port_forward_process.returncode != 0:
|
|
215
|
+
port_forward_exit = True
|
|
216
|
+
break
|
|
217
|
+
|
|
218
|
+
# read whatever output is available and search the buffer for the local port
|
|
219
|
+
if port_forward_process.stdout:
|
|
220
|
+
# Wait up to 1s for data to be available without blocking
|
|
221
|
+
r, _, _ = select.select([port_forward_process.stdout], [], [],
|
|
222
|
+
_SELECT_TIMEOUT)
|
|
223
|
+
if r:
|
|
224
|
+
# Read available bytes from the FD without blocking
|
|
225
|
+
fd = port_forward_process.stdout.fileno()
|
|
226
|
+
raw = os.read(fd, _SELECT_BUFFER_SIZE)
|
|
227
|
+
chunk = raw.decode(errors='ignore')
|
|
228
|
+
buffer += chunk
|
|
229
|
+
match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)',
|
|
230
|
+
buffer)
|
|
231
|
+
if match:
|
|
232
|
+
local_port = int(match.group(1))
|
|
233
|
+
break
|
|
234
|
+
|
|
235
|
+
# sleep for 100ms to avoid busy-waiting
|
|
236
|
+
time.sleep(0.1)
|
|
237
|
+
except BaseException: # pylint: disable=broad-exception-caught
|
|
238
|
+
if port_forward_process:
|
|
239
|
+
stop_svc_port_forward(port_forward_process,
|
|
240
|
+
timeout=terminate_port_forward_timeout)
|
|
241
|
+
raise
|
|
242
|
+
if port_forward_exit:
|
|
243
|
+
raise RuntimeError(f'Port forward failed for service {service} in '
|
|
244
|
+
f'namespace {namespace} on context {context}')
|
|
231
245
|
if local_port is None:
|
|
232
246
|
try:
|
|
233
|
-
port_forward_process
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
port_forward_process.kill()
|
|
237
|
-
port_forward_process.wait()
|
|
247
|
+
if port_forward_process:
|
|
248
|
+
stop_svc_port_forward(port_forward_process,
|
|
249
|
+
timeout=terminate_port_forward_timeout)
|
|
238
250
|
finally:
|
|
239
251
|
raise RuntimeError(
|
|
240
252
|
f'Failed to extract local port for service {service} in '
|
|
@@ -243,14 +255,15 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
|
|
|
243
255
|
return port_forward_process, local_port
|
|
244
256
|
|
|
245
257
|
|
|
246
|
-
def stop_svc_port_forward(port_forward_process: subprocess.Popen
|
|
258
|
+
def stop_svc_port_forward(port_forward_process: subprocess.Popen,
|
|
259
|
+
timeout: int = 5) -> None:
|
|
247
260
|
"""Stops a port forward to a service in a Kubernetes cluster.
|
|
248
261
|
Args:
|
|
249
262
|
port_forward_process: The subprocess.Popen process to terminate
|
|
250
263
|
"""
|
|
251
264
|
try:
|
|
252
265
|
port_forward_process.terminate()
|
|
253
|
-
port_forward_process.wait(timeout=
|
|
266
|
+
port_forward_process.wait(timeout=timeout)
|
|
254
267
|
except subprocess.TimeoutExpired:
|
|
255
268
|
port_forward_process.kill()
|
|
256
269
|
port_forward_process.wait()
|
|
@@ -301,6 +314,10 @@ async def send_metrics_request_with_port_forward(
|
|
|
301
314
|
response.raise_for_status()
|
|
302
315
|
return response.text
|
|
303
316
|
|
|
317
|
+
except Exception as e: # pylint: disable=broad-exception-caught
|
|
318
|
+
logger.error(f'Failed to send metrics request with port forward: '
|
|
319
|
+
f'{common_utils.format_exception(e)}')
|
|
320
|
+
raise
|
|
304
321
|
finally:
|
|
305
322
|
# Always clean up port forward
|
|
306
323
|
if port_forward_process:
|
sky/provision/instance_setup.py
CHANGED
|
@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
|
10
10
|
from sky import exceptions
|
|
11
11
|
from sky import logs
|
|
12
12
|
from sky import provision
|
|
13
|
+
from sky import resources as resources_lib
|
|
13
14
|
from sky import sky_logging
|
|
14
15
|
from sky.provision import common
|
|
15
16
|
from sky.provision import docker_utils
|
|
@@ -92,12 +93,6 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
92
93
|
f'{usage_constants.USAGE_RUN_ID_FILE}')
|
|
93
94
|
|
|
94
95
|
|
|
95
|
-
def _set_skypilot_env_var_cmd() -> str:
|
|
96
|
-
"""Sets the skypilot environment variables on the remote machine."""
|
|
97
|
-
env_vars = env_options.Options.all_options()
|
|
98
|
-
return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
|
|
99
|
-
|
|
100
|
-
|
|
101
96
|
def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
|
|
102
97
|
"""Decorator that retries the function if it fails.
|
|
103
98
|
|
|
@@ -482,11 +477,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
|
|
|
482
477
|
@common.log_function_start_end
|
|
483
478
|
@_auto_retry()
|
|
484
479
|
@timeline.event
|
|
485
|
-
def start_skylet_on_head_node(
|
|
486
|
-
|
|
487
|
-
|
|
480
|
+
def start_skylet_on_head_node(
|
|
481
|
+
cluster_name: resources_utils.ClusterName,
|
|
482
|
+
cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
|
|
483
|
+
launched_resources: resources_lib.Resources) -> None:
|
|
488
484
|
"""Start skylet on the head node."""
|
|
489
|
-
|
|
485
|
+
# Avoid circular import.
|
|
486
|
+
# pylint: disable=import-outside-toplevel
|
|
487
|
+
from sky.utils import controller_utils
|
|
488
|
+
|
|
489
|
+
def _set_skypilot_env_var_cmd() -> str:
|
|
490
|
+
"""Sets the skypilot environment variables on the remote machine."""
|
|
491
|
+
env_vars = {
|
|
492
|
+
k: str(v) for (k, v) in env_options.Options.all_options().items()
|
|
493
|
+
}
|
|
494
|
+
is_controller = controller_utils.Controllers.from_name(
|
|
495
|
+
cluster_name.display_name) is not None
|
|
496
|
+
is_kubernetes = cluster_info.provider_name == 'kubernetes'
|
|
497
|
+
if is_controller and is_kubernetes:
|
|
498
|
+
# For jobs/serve controller, we pass in the CPU and memory limits
|
|
499
|
+
# when starting the skylet to handle cases where these env vars
|
|
500
|
+
# are not set on the cluster's pod spec. The skylet will read
|
|
501
|
+
# these env vars when starting (ManagedJobEvent.start()) and write
|
|
502
|
+
# it to disk.
|
|
503
|
+
resources = launched_resources.assert_launchable()
|
|
504
|
+
vcpus, mem = resources.cloud.get_vcpus_mem_from_instance_type(
|
|
505
|
+
resources.instance_type)
|
|
506
|
+
if vcpus is not None:
|
|
507
|
+
env_vars['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(vcpus)
|
|
508
|
+
if mem is not None:
|
|
509
|
+
env_vars['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem)
|
|
510
|
+
return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
|
|
511
|
+
|
|
490
512
|
runners = provision.get_command_runners(cluster_info.provider_name,
|
|
491
513
|
cluster_info, **ssh_credentials)
|
|
492
514
|
head_runner = runners[0]
|
|
sky/provision/kubernetes/utils.py
CHANGED
|
@@ -1688,7 +1688,10 @@ def check_credentials(context: Optional[str],
|
|
|
1688
1688
|
try:
|
|
1689
1689
|
namespace = get_kube_config_context_namespace(context)
|
|
1690
1690
|
kubernetes.core_api(context).list_namespaced_pod(
|
|
1691
|
-
namespace, _request_timeout=timeout)
|
|
1691
|
+
namespace, limit=1, _request_timeout=timeout)
|
|
1692
|
+
# This call is "free" because this function is a cached call,
|
|
1693
|
+
# and it will not be called again in this function.
|
|
1694
|
+
get_kubernetes_nodes(context=context)
|
|
1692
1695
|
except ImportError:
|
|
1693
1696
|
# TODO(romilb): Update these error strs to also include link to docs
|
|
1694
1697
|
# when docs are ready.
|
sky/provision/provisioner.py
CHANGED
|
@@ -18,6 +18,7 @@ from sky import exceptions
|
|
|
18
18
|
from sky import global_user_state
|
|
19
19
|
from sky import logs
|
|
20
20
|
from sky import provision
|
|
21
|
+
from sky import resources as resources_lib
|
|
21
22
|
from sky import sky_logging
|
|
22
23
|
from sky import skypilot_config
|
|
23
24
|
from sky.adaptors import aws
|
|
@@ -428,13 +429,14 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
|
|
|
428
429
|
|
|
429
430
|
|
|
430
431
|
def _post_provision_setup(
|
|
431
|
-
|
|
432
|
-
handle_cluster_yaml: str,
|
|
432
|
+
launched_resources: resources_lib.Resources,
|
|
433
|
+
cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
|
|
433
434
|
provision_record: provision_common.ProvisionRecord,
|
|
434
435
|
custom_resource: Optional[str]) -> provision_common.ClusterInfo:
|
|
435
436
|
config_from_yaml = global_user_state.get_cluster_yaml_dict(
|
|
436
437
|
handle_cluster_yaml)
|
|
437
438
|
provider_config = config_from_yaml.get('provider')
|
|
439
|
+
cloud_name = repr(launched_resources.cloud)
|
|
438
440
|
cluster_info = provision.get_cluster_info(cloud_name,
|
|
439
441
|
provision_record.region,
|
|
440
442
|
cluster_name.name_on_cloud,
|
|
@@ -694,8 +696,9 @@ def _post_provision_setup(
|
|
|
694
696
|
cluster_info,
|
|
695
697
|
ssh_credentials)
|
|
696
698
|
|
|
697
|
-
instance_setup.start_skylet_on_head_node(cluster_name
|
|
698
|
-
|
|
699
|
+
instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
|
|
700
|
+
ssh_credentials,
|
|
701
|
+
launched_resources)
|
|
699
702
|
|
|
700
703
|
logger.info(
|
|
701
704
|
ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
|
|
@@ -706,8 +709,8 @@ def _post_provision_setup(
|
|
|
706
709
|
|
|
707
710
|
@timeline.event
|
|
708
711
|
def post_provision_runtime_setup(
|
|
709
|
-
|
|
710
|
-
handle_cluster_yaml: str,
|
|
712
|
+
launched_resources: resources_lib.Resources,
|
|
713
|
+
cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
|
|
711
714
|
provision_record: provision_common.ProvisionRecord,
|
|
712
715
|
custom_resource: Optional[str],
|
|
713
716
|
log_dir: str) -> provision_common.ClusterInfo:
|
|
@@ -728,7 +731,7 @@ def post_provision_runtime_setup(
|
|
|
728
731
|
try:
|
|
729
732
|
logger.debug(_TITLE.format('System Setup After Provision'))
|
|
730
733
|
return _post_provision_setup(
|
|
731
|
-
|
|
734
|
+
launched_resources,
|
|
732
735
|
cluster_name,
|
|
733
736
|
handle_cluster_yaml=handle_cluster_yaml,
|
|
734
737
|
provision_record=provision_record,
|
|
sky/schemas/db/global_user_state/010_save_ssh_key.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Add ssh keys in filesystem to global user state.
|
|
2
|
+
|
|
3
|
+
Revision ID: 010
|
|
4
|
+
Revises: 009
|
|
5
|
+
Create Date: 2025-10-07
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
import glob
|
|
9
|
+
# pylint: disable=invalid-name
|
|
10
|
+
import os
|
|
11
|
+
from typing import Sequence, Union
|
|
12
|
+
|
|
13
|
+
from alembic import op
|
|
14
|
+
import sqlalchemy as sa
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = '010'
|
|
18
|
+
down_revision: Union[str, Sequence[str], None] = '009'
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade():
|
|
24
|
+
"""Add SSH keys found under ~/.sky/clients/*/ssh to the ssh_key table."""
|
|
25
|
+
connection = op.get_bind()
|
|
26
|
+
|
|
27
|
+
match_dirs = glob.glob(os.path.expanduser('~/.sky/clients/*/ssh'))
|
|
28
|
+
file_user_hashes = set()
|
|
29
|
+
for match_dir in match_dirs:
|
|
30
|
+
user_hash = match_dir.split('/')[-2]
|
|
31
|
+
file_user_hashes.add(user_hash)
|
|
32
|
+
|
|
33
|
+
# Get all existing ssh keys
|
|
34
|
+
existing_user_hashes = set()
|
|
35
|
+
result = connection.execute(sa.text('SELECT user_hash FROM ssh_key'))
|
|
36
|
+
for row in result:
|
|
37
|
+
existing_user_hashes.add(row[0])
|
|
38
|
+
|
|
39
|
+
user_hashes_to_add = file_user_hashes - existing_user_hashes
|
|
40
|
+
for user_hash in user_hashes_to_add:
|
|
41
|
+
match_dir = os.path.join(os.path.expanduser('~/.sky/clients'),
|
|
42
|
+
user_hash, 'ssh')
|
|
43
|
+
public_key_path = os.path.join(match_dir, 'sky-key.pub')
|
|
44
|
+
private_key_path = os.path.join(match_dir, 'sky-key')
|
|
45
|
+
try:
|
|
46
|
+
with open(public_key_path, 'r', encoding='utf-8') as f:
|
|
47
|
+
public_key = f.read().strip()
|
|
48
|
+
with open(private_key_path, 'r', encoding='utf-8') as f:
|
|
49
|
+
private_key = f.read().strip()
|
|
50
|
+
except FileNotFoundError:
|
|
51
|
+
# Skip if the key files are not found
|
|
52
|
+
continue
|
|
53
|
+
connection.execute(
|
|
54
|
+
sa.text('INSERT INTO ssh_key '
|
|
55
|
+
'(user_hash, ssh_public_key, ssh_private_key) '
|
|
56
|
+
'VALUES (:user_hash, :ssh_public_key, :ssh_private_key) '
|
|
57
|
+
'ON CONFLICT DO NOTHING'), {
|
|
58
|
+
'user_hash': user_hash,
|
|
59
|
+
'ssh_public_key': public_key,
|
|
60
|
+
'ssh_private_key': private_key
|
|
61
|
+
})
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def downgrade():
|
|
65
|
+
"""No-op for backward compatibility."""
|
|
66
|
+
pass
|
sky/server/common.py
CHANGED
|
@@ -950,6 +950,7 @@ def clear_local_api_server_database() -> None:
|
|
|
950
950
|
db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
|
|
951
951
|
for extension in ['', '-shm', '-wal']:
|
|
952
952
|
try:
|
|
953
|
+
logger.debug(f'Removing database file {db_path}{extension}')
|
|
953
954
|
os.remove(f'{db_path}{extension}')
|
|
954
955
|
except FileNotFoundError:
|
|
955
956
|
logger.debug(f'Database file {db_path}{extension} not found.')
|
sky/server/config.py
CHANGED
|
@@ -111,7 +111,9 @@ def compute_server_config(deploy: bool,
|
|
|
111
111
|
process after API server was introduced.
|
|
112
112
|
"""
|
|
113
113
|
cpu_count = common_utils.get_cpu_count()
|
|
114
|
+
logger.debug(f'CPU count: {cpu_count}')
|
|
114
115
|
mem_size_gb = common_utils.get_mem_size_gb()
|
|
116
|
+
logger.debug(f'Memory size: {mem_size_gb}GB')
|
|
115
117
|
max_parallel_for_long = _max_long_worker_parallism(cpu_count,
|
|
116
118
|
mem_size_gb,
|
|
117
119
|
local=not deploy)
|