skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +207 -79
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +112 -53
- sky/client/common.py +4 -2
- sky/client/sdk.py +17 -7
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +9 -54
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +271 -67
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +11 -7
- sky/jobs/server/core.py +5 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -7
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +32 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +5 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +24 -18
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/core.py +1 -0
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +35 -28
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/setup_files/dependencies.py
CHANGED
|
@@ -79,7 +79,7 @@ install_requires = [
|
|
|
79
79
|
# Required for API server metrics
|
|
80
80
|
'prometheus_client>=0.8.0',
|
|
81
81
|
'passlib',
|
|
82
|
-
'bcrypt',
|
|
82
|
+
'bcrypt==4.0.1',
|
|
83
83
|
'pyjwt',
|
|
84
84
|
'gitpython',
|
|
85
85
|
'types-paramiko',
|
|
@@ -112,6 +112,7 @@ server_dependencies = [
|
|
|
112
112
|
GRPC,
|
|
113
113
|
PROTOBUF,
|
|
114
114
|
'aiosqlite',
|
|
115
|
+
'greenlet',
|
|
115
116
|
]
|
|
116
117
|
|
|
117
118
|
local_ray = [
|
|
@@ -192,7 +193,9 @@ extras_require: Dict[str, List[str]] = {
|
|
|
192
193
|
'remote': remote,
|
|
193
194
|
# For the container registry auth api. Reference:
|
|
194
195
|
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
|
|
195
|
-
|
|
196
|
+
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
|
|
197
|
+
# stdlib provides tomllib; on lower versions we depend on tomli explicitly.
|
|
198
|
+
'runpod': ['runpod>=1.6.1', 'tomli; python_version < "3.11"'],
|
|
196
199
|
'fluidstack': [], # No dependencies needed for fluidstack
|
|
197
200
|
'cudo': ['cudo-compute>=0.1.10'],
|
|
198
201
|
'paperspace': [], # No dependencies needed for paperspace
|
sky/setup_files/setup.py
CHANGED
|
@@ -148,47 +148,47 @@ if os.path.exists(readme_filepath):
|
|
|
148
148
|
long_description = io.open(readme_filepath, 'r', encoding='utf-8').read()
|
|
149
149
|
long_description = parse_readme(long_description)
|
|
150
150
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
setuptools.setup(
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
)
|
|
151
|
+
if __name__ == '__main__':
|
|
152
|
+
atexit.register(revert_commit_hash)
|
|
153
|
+
replace_commit_hash()
|
|
154
|
+
setuptools.setup(
|
|
155
|
+
# NOTE: this affects the package.whl wheel name. When changing this (if
|
|
156
|
+
# ever), you must grep for '.whl' and change all corresponding wheel paths
|
|
157
|
+
# (templates/*.j2 and wheel_utils.py).
|
|
158
|
+
name='skypilot-nightly',
|
|
159
|
+
version=find_version(),
|
|
160
|
+
packages=setuptools.find_packages(),
|
|
161
|
+
author='SkyPilot Team',
|
|
162
|
+
license='Apache 2.0',
|
|
163
|
+
readme='README.md',
|
|
164
|
+
description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
|
|
165
|
+
long_description=long_description,
|
|
166
|
+
long_description_content_type='text/markdown',
|
|
167
|
+
setup_requires=['wheel'],
|
|
168
|
+
requires_python='>=3.7',
|
|
169
|
+
install_requires=dependencies['install_requires'],
|
|
170
|
+
extras_require=dependencies['extras_require'],
|
|
171
|
+
entry_points={
|
|
172
|
+
'console_scripts': ['sky = sky.cli:cli'],
|
|
173
|
+
},
|
|
174
|
+
include_package_data=True,
|
|
175
|
+
classifiers=[
|
|
176
|
+
'Programming Language :: Python :: 3.7',
|
|
177
|
+
'Programming Language :: Python :: 3.8',
|
|
178
|
+
'Programming Language :: Python :: 3.9',
|
|
179
|
+
'Programming Language :: Python :: 3.10',
|
|
180
|
+
'Programming Language :: Python :: 3.11',
|
|
181
|
+
'Programming Language :: Python :: 3.12',
|
|
182
|
+
'Programming Language :: Python :: 3.13',
|
|
183
|
+
'License :: OSI Approved :: Apache Software License',
|
|
184
|
+
'Operating System :: OS Independent',
|
|
185
|
+
'Topic :: Software Development :: Libraries :: Python Modules',
|
|
186
|
+
'Topic :: System :: Distributed Computing',
|
|
187
|
+
],
|
|
188
|
+
project_urls={
|
|
189
|
+
'Homepage': 'https://github.com/skypilot-org/skypilot',
|
|
190
|
+
'Issues': 'https://github.com/skypilot-org/skypilot/issues',
|
|
191
|
+
'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
|
|
192
|
+
'Documentation': 'https://docs.skypilot.co/',
|
|
193
|
+
},
|
|
194
|
+
)
|
sky/skylet/constants.py
CHANGED
|
@@ -57,6 +57,9 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
|
|
|
57
57
|
ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
|
|
58
58
|
# uv is used for venv and pip, much faster than python implementations.
|
|
59
59
|
SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
|
|
60
|
+
# set UV_SYSTEM_PYTHON to false in case the
|
|
61
|
+
# user provided docker image set it to true.
|
|
62
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
60
63
|
SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
|
|
61
64
|
f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
|
|
62
65
|
# This won't reinstall uv if it's already installed, so it's safe to re-run.
|
|
@@ -97,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
97
100
|
# cluster yaml is updated.
|
|
98
101
|
#
|
|
99
102
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
100
|
-
SKYLET_VERSION = '
|
|
103
|
+
SKYLET_VERSION = '19'
|
|
101
104
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
102
105
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
103
106
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
sky/skylet/events.py
CHANGED
|
@@ -11,6 +11,7 @@ import psutil
|
|
|
11
11
|
from sky import clouds
|
|
12
12
|
from sky import sky_logging
|
|
13
13
|
from sky.backends import cloud_vm_ray_backend
|
|
14
|
+
from sky.jobs import constants as managed_job_constants
|
|
14
15
|
from sky.jobs import scheduler
|
|
15
16
|
from sky.jobs import state as managed_job_state
|
|
16
17
|
from sky.jobs import utils as managed_job_utils
|
|
@@ -21,6 +22,7 @@ from sky.skylet import job_lib
|
|
|
21
22
|
from sky.usage import usage_lib
|
|
22
23
|
from sky.utils import cluster_utils
|
|
23
24
|
from sky.utils import registry
|
|
25
|
+
from sky.utils import subprocess_utils
|
|
24
26
|
from sky.utils import ux_utils
|
|
25
27
|
from sky.utils import yaml_utils
|
|
26
28
|
|
|
@@ -74,6 +76,46 @@ class ManagedJobEvent(SkyletEvent):
|
|
|
74
76
|
EVENT_INTERVAL_SECONDS = 300
|
|
75
77
|
|
|
76
78
|
def _run(self):
|
|
79
|
+
if not os.path.exists(
|
|
80
|
+
os.path.expanduser(
|
|
81
|
+
managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)):
|
|
82
|
+
# Note: since the skylet is started before the user setup (in
|
|
83
|
+
# jobs-controller.yaml.j2) runs, it's possible that we hit this
|
|
84
|
+
# before the indicator file is written. However, since we will wait
|
|
85
|
+
# EVENT_INTERVAL_SECONDS before the first run, this should be very
|
|
86
|
+
# unlikely.
|
|
87
|
+
logger.info('No jobs controller indicator file found.')
|
|
88
|
+
all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
|
|
89
|
+
if not all_job_ids:
|
|
90
|
+
logger.info('No jobs running. Stopping controllers.')
|
|
91
|
+
# TODO(cooperc): Move this to a shared function also called by
|
|
92
|
+
# sdk.api_stop(). (#7229)
|
|
93
|
+
try:
|
|
94
|
+
with open(os.path.expanduser(
|
|
95
|
+
scheduler.JOB_CONTROLLER_PID_PATH),
|
|
96
|
+
'r',
|
|
97
|
+
encoding='utf-8') as f:
|
|
98
|
+
pids = f.read().split('\n')[:-1]
|
|
99
|
+
for pid in pids:
|
|
100
|
+
if subprocess_utils.is_process_alive(
|
|
101
|
+
int(pid.strip())):
|
|
102
|
+
subprocess_utils.kill_children_processes(
|
|
103
|
+
parent_pids=[int(pid.strip())], force=True)
|
|
104
|
+
os.remove(
|
|
105
|
+
os.path.expanduser(scheduler.JOB_CONTROLLER_PID_PATH))
|
|
106
|
+
except FileNotFoundError:
|
|
107
|
+
# its fine we will create it
|
|
108
|
+
pass
|
|
109
|
+
except Exception as e: # pylint: disable=broad-except
|
|
110
|
+
# in case we get perm issues or something is messed up, just
|
|
111
|
+
# ignore it and assume the process is dead
|
|
112
|
+
logger.error(
|
|
113
|
+
f'Error looking at job controller pid file: {e}')
|
|
114
|
+
pass
|
|
115
|
+
logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
|
|
116
|
+
'indicator file hasn\'t been written yet.')
|
|
117
|
+
return
|
|
118
|
+
|
|
77
119
|
logger.info('=== Updating managed job status ===')
|
|
78
120
|
managed_job_utils.update_managed_jobs_statuses()
|
|
79
121
|
scheduler.maybe_start_controllers()
|
|
@@ -36,6 +36,9 @@ setup: |
|
|
|
36
36
|
grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
|
|
37
37
|
{% endif %}
|
|
38
38
|
|
|
39
|
+
# This is used by the skylet events to check if we are a jobs controller.
|
|
40
|
+
touch {{job_controller_indicator_file}}
|
|
41
|
+
|
|
39
42
|
run: |
|
|
40
43
|
{%- if consolidation_mode_job_id is none %}
|
|
41
44
|
{{ sky_activate_python_env }}
|
|
@@ -510,6 +510,16 @@ available_node_types:
|
|
|
510
510
|
valueFrom:
|
|
511
511
|
fieldRef:
|
|
512
512
|
fieldPath: metadata.labels['ray-node-type']
|
|
513
|
+
- name: SKYPILOT_POD_CPU_CORE_LIMIT
|
|
514
|
+
valueFrom:
|
|
515
|
+
resourceFieldRef:
|
|
516
|
+
containerName: ray-node
|
|
517
|
+
resource: requests.cpu
|
|
518
|
+
- name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
|
|
519
|
+
valueFrom:
|
|
520
|
+
resourceFieldRef:
|
|
521
|
+
containerName: ray-node
|
|
522
|
+
resource: requests.memory
|
|
513
523
|
{% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
|
|
514
524
|
- name: {{ key }}
|
|
515
525
|
value: {{ value }}
|
|
@@ -630,13 +640,6 @@ available_node_types:
|
|
|
630
640
|
command: ["/bin/bash", "-c", "--"]
|
|
631
641
|
args:
|
|
632
642
|
- |
|
|
633
|
-
# For backwards compatibility, we put a marker file in the pod
|
|
634
|
-
# to indicate that the pod is running with the changes introduced
|
|
635
|
-
# in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
|
|
636
|
-
# TODO: Remove this marker file and it's usage in setup_commands
|
|
637
|
-
# after v0.10.0 release.
|
|
638
|
-
touch /tmp/skypilot_is_nimbus
|
|
639
|
-
|
|
640
643
|
# Helper function to conditionally use sudo
|
|
641
644
|
# TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
|
|
642
645
|
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
|
|
@@ -898,15 +901,20 @@ available_node_types:
|
|
|
898
901
|
{{ conda_installation_commands }}
|
|
899
902
|
{{ ray_installation_commands }}
|
|
900
903
|
|
|
901
|
-
|
|
904
|
+
# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
|
|
905
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
906
|
+
VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
|
|
902
907
|
# Wait for `patch` package to be installed before applying ray patches
|
|
903
908
|
until dpkg -l | grep -q "^ii patch "; do
|
|
904
909
|
sleep 0.1
|
|
905
910
|
echo "Waiting for patch package to be installed..."
|
|
906
911
|
done
|
|
907
912
|
# Apply Ray patches for progress bar fix
|
|
908
|
-
|
|
909
|
-
|
|
913
|
+
# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
|
|
914
|
+
# unset PYTHONPATH in case the user provided docker image set it.
|
|
915
|
+
# ~/.sky/python_path is seeded by conda_installation_commands
|
|
916
|
+
VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
|
|
917
|
+
$(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
|
|
910
918
|
}
|
|
911
919
|
touch /tmp/ray_skypilot_installation_complete
|
|
912
920
|
echo "=== Ray and skypilot installation completed ==="
|
|
@@ -1333,18 +1341,16 @@ setup_commands:
|
|
|
1333
1341
|
# Wait for SSH setup to complete before proceeding
|
|
1334
1342
|
if [ -f /tmp/apt_ssh_setup_started ]; then
|
|
1335
1343
|
echo "=== Logs for asynchronous SSH setup ===";
|
|
1336
|
-
[ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
|
|
1337
|
-
{ tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1344
|
+
([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
|
|
1345
|
+
{ tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1338
1346
|
[ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
|
|
1339
1347
|
fi
|
|
1340
1348
|
|
|
1341
1349
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
|
1347
|
-
fi
|
|
1350
|
+
([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
|
|
1351
|
+
{ tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
|
1352
|
+
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
|
1353
|
+
|
|
1348
1354
|
end_epoch=$(date +%s);
|
|
1349
1355
|
echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
|
|
1350
1356
|
start_epoch=$(date +%s);
|
sky/usage/usage_lib.py
CHANGED
|
@@ -14,6 +14,7 @@ from typing_extensions import ParamSpec
|
|
|
14
14
|
|
|
15
15
|
import sky
|
|
16
16
|
from sky import sky_logging
|
|
17
|
+
from sky import skypilot_config
|
|
17
18
|
from sky.adaptors import common as adaptors_common
|
|
18
19
|
from sky.usage import constants
|
|
19
20
|
from sky.utils import common_utils
|
|
@@ -167,6 +168,7 @@ class UsageMessageToReport(MessageToReport):
|
|
|
167
168
|
self.runtimes: Dict[str, float] = {} # update_runtime
|
|
168
169
|
self.exception: Optional[str] = None # entrypoint_context
|
|
169
170
|
self.stacktrace: Optional[str] = None # entrypoint_context
|
|
171
|
+
self.skypilot_config: Optional[Dict[str, Any]] = None
|
|
170
172
|
|
|
171
173
|
# Whether API server is deployed remotely.
|
|
172
174
|
self.using_remote_api_server: bool = (
|
|
@@ -177,6 +179,7 @@ class UsageMessageToReport(MessageToReport):
|
|
|
177
179
|
self.client_entrypoint = common_utils.get_current_client_entrypoint(
|
|
178
180
|
msg)
|
|
179
181
|
self.entrypoint = msg
|
|
182
|
+
self.skypilot_config = dict(skypilot_config.to_dict())
|
|
180
183
|
|
|
181
184
|
def set_internal(self):
|
|
182
185
|
self.internal = True
|
|
@@ -11,6 +11,7 @@ from sky.utils import common_utils
|
|
|
11
11
|
from sky.utils import log_utils
|
|
12
12
|
from sky.utils import resources_utils
|
|
13
13
|
from sky.utils import status_lib
|
|
14
|
+
from sky.utils import ux_utils
|
|
14
15
|
|
|
15
16
|
if typing.TYPE_CHECKING:
|
|
16
17
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
@@ -105,11 +106,9 @@ def show_status_table(cluster_records: List[responses.StatusResponse],
|
|
|
105
106
|
|
|
106
107
|
if query_clusters:
|
|
107
108
|
cluster_names = {record['name'] for record in cluster_records}
|
|
108
|
-
not_found_clusters =
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
if cluster not in cluster_names
|
|
112
|
-
]
|
|
109
|
+
not_found_clusters = ux_utils.get_non_matched_query(
|
|
110
|
+
query_clusters, cluster_names)
|
|
111
|
+
not_found_clusters = [repr(cluster) for cluster in not_found_clusters]
|
|
113
112
|
if not_found_clusters:
|
|
114
113
|
cluster_str = 'Cluster'
|
|
115
114
|
if len(not_found_clusters) > 1:
|
sky/utils/context.py
CHANGED
|
@@ -2,15 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
4
|
from collections.abc import Mapping
|
|
5
|
-
from collections.abc import MutableMapping
|
|
6
5
|
import contextvars
|
|
6
|
+
import copy
|
|
7
7
|
import functools
|
|
8
|
+
import inspect
|
|
8
9
|
import os
|
|
9
10
|
import pathlib
|
|
10
11
|
import subprocess
|
|
11
12
|
import sys
|
|
12
|
-
import
|
|
13
|
-
|
|
13
|
+
from typing import (Callable, Dict, Iterator, MutableMapping, Optional, TextIO,
|
|
14
|
+
TYPE_CHECKING, TypeVar)
|
|
15
|
+
|
|
16
|
+
from typing_extensions import ParamSpec
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from sky.skypilot_config import ConfigContext
|
|
14
20
|
|
|
15
21
|
|
|
16
22
|
class Context(object):
|
|
@@ -88,7 +94,7 @@ class Context(object):
|
|
|
88
94
|
else:
|
|
89
95
|
self._log_file_handle = open(log_file, 'a', encoding='utf-8')
|
|
90
96
|
self._log_file = log_file
|
|
91
|
-
if
|
|
97
|
+
if original_log_handle is not None:
|
|
92
98
|
original_log_handle.close()
|
|
93
99
|
return original_log_file
|
|
94
100
|
|
|
@@ -102,8 +108,30 @@ class Context(object):
|
|
|
102
108
|
for k, v in envs.items():
|
|
103
109
|
self.env_overrides[k] = v
|
|
104
110
|
|
|
111
|
+
def cleanup(self):
|
|
112
|
+
"""Clean up the context."""
|
|
113
|
+
if self._log_file_handle is not None:
|
|
114
|
+
self._log_file_handle.close()
|
|
115
|
+
self._log_file_handle = None
|
|
116
|
+
|
|
117
|
+
def copy(self) -> 'Context':
|
|
118
|
+
"""Create a copy of the context.
|
|
119
|
+
|
|
120
|
+
Changes to the current context after this call will not affect the copy.
|
|
121
|
+
The new context will get its own handle/fd for the log file.
|
|
122
|
+
The new context will get an independent copy of the env var overrides.
|
|
123
|
+
The new context will get an independent copy of the config context.
|
|
124
|
+
Cancellation of the current context will not be propagated to the copy.
|
|
125
|
+
"""
|
|
126
|
+
new_context = Context()
|
|
127
|
+
new_context.redirect_log(self._log_file)
|
|
128
|
+
new_context.env_overrides = self.env_overrides.copy()
|
|
129
|
+
new_context.config_context = copy.deepcopy(self.config_context)
|
|
130
|
+
return new_context
|
|
105
131
|
|
|
106
|
-
|
|
132
|
+
|
|
133
|
+
_CONTEXT = contextvars.ContextVar[Optional[Context]]('sky_context',
|
|
134
|
+
default=None)
|
|
107
135
|
|
|
108
136
|
|
|
109
137
|
def get() -> Optional[Context]:
|
|
@@ -116,7 +144,7 @@ def get() -> Optional[Context]:
|
|
|
116
144
|
return _CONTEXT.get()
|
|
117
145
|
|
|
118
146
|
|
|
119
|
-
class ContextualEnviron(MutableMapping):
|
|
147
|
+
class ContextualEnviron(MutableMapping[str, str]):
|
|
120
148
|
"""Environment variables wrapper with contextual overrides.
|
|
121
149
|
|
|
122
150
|
An instance of ContextualEnviron will typically be used to replace
|
|
@@ -155,10 +183,10 @@ class ContextualEnviron(MutableMapping):
|
|
|
155
183
|
assert os.environ['FOO'] == 'BAR1'
|
|
156
184
|
"""
|
|
157
185
|
|
|
158
|
-
def __init__(self, environ):
|
|
186
|
+
def __init__(self, environ: 'os._Environ[str]') -> None:
|
|
159
187
|
self._environ = environ
|
|
160
188
|
|
|
161
|
-
def __getitem__(self, key):
|
|
189
|
+
def __getitem__(self, key: str) -> str:
|
|
162
190
|
ctx = get()
|
|
163
191
|
if ctx is not None:
|
|
164
192
|
if key in ctx.env_overrides:
|
|
@@ -170,10 +198,10 @@ class ContextualEnviron(MutableMapping):
|
|
|
170
198
|
return value
|
|
171
199
|
return self._environ[key]
|
|
172
200
|
|
|
173
|
-
def __iter__(self):
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
201
|
+
def __iter__(self) -> Iterator[str]:
|
|
202
|
+
|
|
203
|
+
def iter_from_context(ctx: Context) -> Iterator[str]:
|
|
204
|
+
deleted_keys = set()
|
|
177
205
|
for key, value in ctx.env_overrides.items():
|
|
178
206
|
if value is None:
|
|
179
207
|
deleted_keys.add(key)
|
|
@@ -182,20 +210,24 @@ class ContextualEnviron(MutableMapping):
|
|
|
182
210
|
# Deduplicate the keys
|
|
183
211
|
if key not in ctx.env_overrides and key not in deleted_keys:
|
|
184
212
|
yield key
|
|
213
|
+
|
|
214
|
+
ctx = get()
|
|
215
|
+
if ctx is not None:
|
|
216
|
+
return iter_from_context(ctx)
|
|
185
217
|
else:
|
|
186
218
|
return self._environ.__iter__()
|
|
187
219
|
|
|
188
|
-
def __len__(self):
|
|
220
|
+
def __len__(self) -> int:
|
|
189
221
|
return len(dict(self))
|
|
190
222
|
|
|
191
|
-
def __setitem__(self, key, value):
|
|
223
|
+
def __setitem__(self, key: str, value: str) -> None:
|
|
192
224
|
ctx = get()
|
|
193
225
|
if ctx is not None:
|
|
194
226
|
ctx.env_overrides[key] = value
|
|
195
227
|
else:
|
|
196
228
|
self._environ.__setitem__(key, value)
|
|
197
229
|
|
|
198
|
-
def __delitem__(self, key):
|
|
230
|
+
def __delitem__(self, key: str) -> None:
|
|
199
231
|
ctx = get()
|
|
200
232
|
if ctx is not None:
|
|
201
233
|
if key in ctx.env_overrides:
|
|
@@ -211,10 +243,13 @@ class ContextualEnviron(MutableMapping):
|
|
|
211
243
|
else:
|
|
212
244
|
self._environ.__delitem__(key)
|
|
213
245
|
|
|
214
|
-
def __repr__(self):
|
|
215
|
-
|
|
246
|
+
def __repr__(self) -> str:
|
|
247
|
+
# Adapted from os._Environ.__repr__
|
|
248
|
+
formatted_items = ', '.join(
|
|
249
|
+
f'{key!r}: {value!r}' for key, value in self.items())
|
|
250
|
+
return f'ctx_environ({{{formatted_items}}})'
|
|
216
251
|
|
|
217
|
-
def copy(self):
|
|
252
|
+
def copy(self) -> Dict[str, str]:
|
|
218
253
|
copied = self._environ.copy()
|
|
219
254
|
ctx = get()
|
|
220
255
|
if ctx is not None:
|
|
@@ -225,7 +260,7 @@ class ContextualEnviron(MutableMapping):
|
|
|
225
260
|
copied[key] = ctx.env_overrides[key]
|
|
226
261
|
return copied
|
|
227
262
|
|
|
228
|
-
def setdefault(self, key, default
|
|
263
|
+
def setdefault(self, key: str, default: str) -> str:
|
|
229
264
|
return self._environ.setdefault(key, default)
|
|
230
265
|
|
|
231
266
|
def __ior__(self, other):
|
|
@@ -260,27 +295,67 @@ class Popen(subprocess.Popen):
|
|
|
260
295
|
super().__init__(*args, env=env, **kwargs)
|
|
261
296
|
|
|
262
297
|
|
|
263
|
-
|
|
298
|
+
P = ParamSpec('P')
|
|
299
|
+
T = TypeVar('T')
|
|
264
300
|
|
|
265
301
|
|
|
266
|
-
def contextual(func:
|
|
302
|
+
def contextual(func: Callable[P, T]) -> Callable[P, T]:
|
|
267
303
|
"""Decorator to initialize a context before executing the function.
|
|
268
304
|
|
|
269
|
-
If a context is already initialized, this decorator will
|
|
270
|
-
|
|
305
|
+
If a context is already initialized, this decorator will create a new
|
|
306
|
+
context that inherits the values from the existing context.
|
|
271
307
|
"""
|
|
272
308
|
|
|
273
309
|
@functools.wraps(func)
|
|
274
|
-
def wrapper(*args, **kwargs):
|
|
275
|
-
|
|
276
|
-
|
|
310
|
+
def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
311
|
+
original_ctx = get()
|
|
312
|
+
initialize(original_ctx)
|
|
313
|
+
ctx = get()
|
|
314
|
+
cleanup_after_await = False
|
|
315
|
+
|
|
316
|
+
def cleanup():
|
|
317
|
+
try:
|
|
318
|
+
if ctx is not None:
|
|
319
|
+
ctx.cleanup()
|
|
320
|
+
finally:
|
|
321
|
+
# Note: _CONTEXT.reset() is not reliable - may fail with
|
|
322
|
+
# ValueError: <Token ... at ...> was created in a different
|
|
323
|
+
# Context
|
|
324
|
+
# We must make sure this happens because otherwise we may try to
|
|
325
|
+
# write to the wrong log.
|
|
326
|
+
_CONTEXT.set(original_ctx)
|
|
327
|
+
|
|
328
|
+
# There are two cases:
|
|
329
|
+
# 1. The function is synchronous (that is, return type is not awaitable)
|
|
330
|
+
# In this case, we use a finally block to cleanup the context.
|
|
331
|
+
# 2. The function is asynchronous (that is, return type is awaitable)
|
|
332
|
+
# In this case, we need to construct an async def wrapper and await
|
|
333
|
+
# the value, then call the cleanup function in the finally block.
|
|
334
|
+
|
|
335
|
+
async def await_with_cleanup(awaitable):
|
|
336
|
+
try:
|
|
337
|
+
return await awaitable
|
|
338
|
+
finally:
|
|
339
|
+
cleanup()
|
|
340
|
+
|
|
341
|
+
try:
|
|
342
|
+
ret = func(*args, **kwargs)
|
|
343
|
+
if inspect.isawaitable(ret):
|
|
344
|
+
cleanup_after_await = True
|
|
345
|
+
return await_with_cleanup(ret)
|
|
346
|
+
else:
|
|
347
|
+
return ret
|
|
348
|
+
finally:
|
|
349
|
+
if not cleanup_after_await:
|
|
350
|
+
cleanup()
|
|
277
351
|
|
|
278
|
-
return
|
|
352
|
+
return wrapper
|
|
279
353
|
|
|
280
354
|
|
|
281
|
-
def initialize():
|
|
355
|
+
def initialize(base_context: Optional[Context] = None) -> None:
|
|
282
356
|
"""Initialize the current SkyPilot context."""
|
|
283
|
-
|
|
357
|
+
new_context = base_context.copy() if base_context is not None else Context()
|
|
358
|
+
_CONTEXT.set(new_context)
|
|
284
359
|
|
|
285
360
|
|
|
286
361
|
class _ContextualStream:
|
sky/utils/controller_utils.py
CHANGED
|
@@ -620,15 +620,16 @@ def get_controller_resources(
|
|
|
620
620
|
controller_resources_to_use: resources.Resources = list(
|
|
621
621
|
controller_resources)[0]
|
|
622
622
|
|
|
623
|
-
|
|
623
|
+
controller_handle = global_user_state.get_handle_from_cluster_name(
|
|
624
624
|
controller.value.cluster_name)
|
|
625
|
-
if
|
|
626
|
-
|
|
627
|
-
if handle is not None:
|
|
625
|
+
if controller_handle is not None:
|
|
626
|
+
if controller_handle is not None:
|
|
628
627
|
# Use the existing resources, but override the autostop config with
|
|
629
628
|
# the one currently specified in the config.
|
|
630
|
-
controller_resources_to_use =
|
|
631
|
-
|
|
629
|
+
controller_resources_to_use = (
|
|
630
|
+
controller_handle.launched_resources.copy(
|
|
631
|
+
autostop=controller_resources_config_copied.get('autostop'))
|
|
632
|
+
)
|
|
632
633
|
|
|
633
634
|
# If the controller and replicas are from the same cloud (and region/zone),
|
|
634
635
|
# it should provide better connectivity. We will let the controller choose
|
sky/utils/db/db_utils.py
CHANGED
|
@@ -201,6 +201,7 @@ def add_column_to_table_alembic(
|
|
|
201
201
|
server_default: Optional[str] = None,
|
|
202
202
|
copy_from: Optional[str] = None,
|
|
203
203
|
value_to_replace_existing_entries: Optional[Any] = None,
|
|
204
|
+
index: Optional[bool] = None,
|
|
204
205
|
):
|
|
205
206
|
"""Add a column to a table using Alembic operations.
|
|
206
207
|
|
|
@@ -215,6 +216,8 @@ def add_column_to_table_alembic(
|
|
|
215
216
|
copy_from: Column name to copy values from (for existing rows)
|
|
216
217
|
value_to_replace_existing_entries: Default value for existing NULL
|
|
217
218
|
entries
|
|
219
|
+
index: If True, create an index on this column. If None, no index
|
|
220
|
+
is created.
|
|
218
221
|
"""
|
|
219
222
|
from alembic import op # pylint: disable=import-outside-toplevel
|
|
220
223
|
|
|
@@ -222,7 +225,8 @@ def add_column_to_table_alembic(
|
|
|
222
225
|
# Create the column with server_default if provided
|
|
223
226
|
column = sqlalchemy.Column(column_name,
|
|
224
227
|
column_type,
|
|
225
|
-
server_default=server_default
|
|
228
|
+
server_default=server_default,
|
|
229
|
+
index=index)
|
|
226
230
|
op.add_column(table_name, column)
|
|
227
231
|
|
|
228
232
|
# Handle data migration
|
sky/utils/db/migration_utils.py
CHANGED
|
@@ -17,7 +17,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
17
17
|
DB_INIT_LOCK_TIMEOUT_SECONDS = 10
|
|
18
18
|
|
|
19
19
|
GLOBAL_USER_STATE_DB_NAME = 'state_db'
|
|
20
|
-
GLOBAL_USER_STATE_VERSION = '
|
|
20
|
+
GLOBAL_USER_STATE_VERSION = '009'
|
|
21
21
|
GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
|
|
22
22
|
|
|
23
23
|
SPOT_JOBS_DB_NAME = 'spot_jobs_db'
|