skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. More details are linked from the original release page.

Files changed (123)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +207 -79
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +112 -53
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +17 -7
  9. sky/client/sdk_async.py +4 -2
  10. sky/clouds/kubernetes.py +2 -1
  11. sky/clouds/runpod.py +20 -7
  12. sky/core.py +9 -54
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/data/mounting_utils.py +19 -10
  38. sky/execution.py +4 -2
  39. sky/global_user_state.py +271 -67
  40. sky/jobs/client/sdk.py +10 -1
  41. sky/jobs/constants.py +2 -0
  42. sky/jobs/controller.py +11 -7
  43. sky/jobs/server/core.py +5 -3
  44. sky/jobs/server/server.py +15 -11
  45. sky/jobs/utils.py +1 -1
  46. sky/logs/agent.py +30 -3
  47. sky/logs/aws.py +9 -19
  48. sky/provision/__init__.py +2 -1
  49. sky/provision/aws/instance.py +2 -1
  50. sky/provision/azure/instance.py +2 -1
  51. sky/provision/cudo/instance.py +2 -2
  52. sky/provision/do/instance.py +2 -2
  53. sky/provision/docker_utils.py +41 -19
  54. sky/provision/fluidstack/instance.py +2 -2
  55. sky/provision/gcp/instance.py +2 -1
  56. sky/provision/hyperbolic/instance.py +2 -1
  57. sky/provision/instance_setup.py +1 -1
  58. sky/provision/kubernetes/instance.py +134 -8
  59. sky/provision/lambda_cloud/instance.py +2 -1
  60. sky/provision/nebius/instance.py +2 -1
  61. sky/provision/oci/instance.py +2 -1
  62. sky/provision/paperspace/instance.py +2 -2
  63. sky/provision/primeintellect/instance.py +2 -2
  64. sky/provision/provisioner.py +1 -0
  65. sky/provision/runpod/__init__.py +2 -0
  66. sky/provision/runpod/instance.py +2 -2
  67. sky/provision/scp/instance.py +2 -2
  68. sky/provision/seeweb/instance.py +2 -1
  69. sky/provision/vast/instance.py +2 -1
  70. sky/provision/vsphere/instance.py +6 -5
  71. sky/schemas/api/responses.py +2 -1
  72. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  73. sky/serve/autoscalers.py +2 -0
  74. sky/serve/client/impl.py +45 -19
  75. sky/serve/replica_managers.py +12 -5
  76. sky/serve/serve_utils.py +5 -7
  77. sky/serve/server/core.py +9 -6
  78. sky/serve/server/impl.py +78 -25
  79. sky/serve/server/server.py +4 -5
  80. sky/serve/service_spec.py +33 -0
  81. sky/server/constants.py +1 -1
  82. sky/server/daemons.py +2 -3
  83. sky/server/requests/executor.py +56 -6
  84. sky/server/requests/payloads.py +32 -8
  85. sky/server/requests/preconditions.py +2 -3
  86. sky/server/rest.py +2 -0
  87. sky/server/server.py +28 -19
  88. sky/server/stream_utils.py +34 -12
  89. sky/setup_files/dependencies.py +5 -2
  90. sky/setup_files/setup.py +44 -44
  91. sky/skylet/constants.py +4 -1
  92. sky/skylet/events.py +42 -0
  93. sky/templates/jobs-controller.yaml.j2 +3 -0
  94. sky/templates/kubernetes-ray.yml.j2 +24 -18
  95. sky/usage/usage_lib.py +3 -0
  96. sky/utils/cli_utils/status_utils.py +4 -5
  97. sky/utils/context.py +104 -29
  98. sky/utils/controller_utils.py +7 -6
  99. sky/utils/db/db_utils.py +5 -1
  100. sky/utils/db/migration_utils.py +1 -1
  101. sky/utils/kubernetes/create_cluster.sh +13 -28
  102. sky/utils/kubernetes/delete_cluster.sh +10 -7
  103. sky/utils/kubernetes/generate_kind_config.py +6 -66
  104. sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
  105. sky/utils/kubernetes_enums.py +5 -0
  106. sky/utils/ux_utils.py +35 -1
  107. sky/utils/yaml_utils.py +9 -0
  108. sky/volumes/client/sdk.py +44 -8
  109. sky/volumes/server/core.py +1 -0
  110. sky/volumes/server/server.py +33 -7
  111. sky/volumes/volume.py +35 -28
  112. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
  113. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
  114. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  119. /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
  120. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
@@ -79,7 +79,7 @@ install_requires = [
79
79
  # Required for API server metrics
80
80
  'prometheus_client>=0.8.0',
81
81
  'passlib',
82
- 'bcrypt',
82
+ 'bcrypt==4.0.1',
83
83
  'pyjwt',
84
84
  'gitpython',
85
85
  'types-paramiko',
@@ -112,6 +112,7 @@ server_dependencies = [
112
112
  GRPC,
113
113
  PROTOBUF,
114
114
  'aiosqlite',
115
+ 'greenlet',
115
116
  ]
116
117
 
117
118
  local_ray = [
@@ -192,7 +193,9 @@ extras_require: Dict[str, List[str]] = {
192
193
  'remote': remote,
193
194
  # For the container registry auth api. Reference:
194
195
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
195
- 'runpod': ['runpod>=1.6.1'],
196
+ # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
197
+ # stdlib provides tomllib; on lower versions we depend on tomli explicitly.
198
+ 'runpod': ['runpod>=1.6.1', 'tomli; python_version < "3.11"'],
196
199
  'fluidstack': [], # No dependencies needed for fluidstack
197
200
  'cudo': ['cudo-compute>=0.1.10'],
198
201
  'paperspace': [], # No dependencies needed for paperspace
sky/setup_files/setup.py CHANGED
@@ -148,47 +148,47 @@ if os.path.exists(readme_filepath):
148
148
  long_description = io.open(readme_filepath, 'r', encoding='utf-8').read()
149
149
  long_description = parse_readme(long_description)
150
150
 
151
- atexit.register(revert_commit_hash)
152
- replace_commit_hash()
153
-
154
- setuptools.setup(
155
- # NOTE: this affects the package.whl wheel name. When changing this (if
156
- # ever), you must grep for '.whl' and change all corresponding wheel paths
157
- # (templates/*.j2 and wheel_utils.py).
158
- name='skypilot-nightly',
159
- version=find_version(),
160
- packages=setuptools.find_packages(),
161
- author='SkyPilot Team',
162
- license='Apache 2.0',
163
- readme='README.md',
164
- description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
165
- long_description=long_description,
166
- long_description_content_type='text/markdown',
167
- setup_requires=['wheel'],
168
- requires_python='>=3.7',
169
- install_requires=dependencies['install_requires'],
170
- extras_require=dependencies['extras_require'],
171
- entry_points={
172
- 'console_scripts': ['sky = sky.cli:cli'],
173
- },
174
- include_package_data=True,
175
- classifiers=[
176
- 'Programming Language :: Python :: 3.7',
177
- 'Programming Language :: Python :: 3.8',
178
- 'Programming Language :: Python :: 3.9',
179
- 'Programming Language :: Python :: 3.10',
180
- 'Programming Language :: Python :: 3.11',
181
- 'Programming Language :: Python :: 3.12',
182
- 'Programming Language :: Python :: 3.13',
183
- 'License :: OSI Approved :: Apache Software License',
184
- 'Operating System :: OS Independent',
185
- 'Topic :: Software Development :: Libraries :: Python Modules',
186
- 'Topic :: System :: Distributed Computing',
187
- ],
188
- project_urls={
189
- 'Homepage': 'https://github.com/skypilot-org/skypilot',
190
- 'Issues': 'https://github.com/skypilot-org/skypilot/issues',
191
- 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
192
- 'Documentation': 'https://docs.skypilot.co/',
193
- },
194
- )
151
+ if __name__ == '__main__':
152
+ atexit.register(revert_commit_hash)
153
+ replace_commit_hash()
154
+ setuptools.setup(
155
+ # NOTE: this affects the package.whl wheel name. When changing this (if
156
+ # ever), you must grep for '.whl' and change all corresponding wheel paths
157
+ # (templates/*.j2 and wheel_utils.py).
158
+ name='skypilot-nightly',
159
+ version=find_version(),
160
+ packages=setuptools.find_packages(),
161
+ author='SkyPilot Team',
162
+ license='Apache 2.0',
163
+ readme='README.md',
164
+ description='SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.',
165
+ long_description=long_description,
166
+ long_description_content_type='text/markdown',
167
+ setup_requires=['wheel'],
168
+ requires_python='>=3.7',
169
+ install_requires=dependencies['install_requires'],
170
+ extras_require=dependencies['extras_require'],
171
+ entry_points={
172
+ 'console_scripts': ['sky = sky.cli:cli'],
173
+ },
174
+ include_package_data=True,
175
+ classifiers=[
176
+ 'Programming Language :: Python :: 3.7',
177
+ 'Programming Language :: Python :: 3.8',
178
+ 'Programming Language :: Python :: 3.9',
179
+ 'Programming Language :: Python :: 3.10',
180
+ 'Programming Language :: Python :: 3.11',
181
+ 'Programming Language :: Python :: 3.12',
182
+ 'Programming Language :: Python :: 3.13',
183
+ 'License :: OSI Approved :: Apache Software License',
184
+ 'Operating System :: OS Independent',
185
+ 'Topic :: Software Development :: Libraries :: Python Modules',
186
+ 'Topic :: System :: Distributed Computing',
187
+ ],
188
+ project_urls={
189
+ 'Homepage': 'https://github.com/skypilot-org/skypilot',
190
+ 'Issues': 'https://github.com/skypilot-org/skypilot/issues',
191
+ 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions',
192
+ 'Documentation': 'https://docs.skypilot.co/',
193
+ },
194
+ )
sky/skylet/constants.py CHANGED
@@ -57,6 +57,9 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
57
57
  ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
58
58
  # uv is used for venv and pip, much faster than python implementations.
59
59
  SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
60
+ # set UV_SYSTEM_PYTHON to false in case the
61
+ # user provided docker image set it to true.
62
+ # unset PYTHONPATH in case the user provided docker image set it.
60
63
  SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
61
64
  f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
62
65
  # This won't reinstall uv if it's already installed, so it's safe to re-run.
@@ -97,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
97
100
  # cluster yaml is updated.
98
101
  #
99
102
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
100
- SKYLET_VERSION = '18'
103
+ SKYLET_VERSION = '19'
101
104
  # The version of the lib files that skylet/jobs use. Whenever there is an API
102
105
  # change for the job_lib or log_lib, we need to bump this version, so that the
103
106
  # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/events.py CHANGED
@@ -11,6 +11,7 @@ import psutil
11
11
  from sky import clouds
12
12
  from sky import sky_logging
13
13
  from sky.backends import cloud_vm_ray_backend
14
+ from sky.jobs import constants as managed_job_constants
14
15
  from sky.jobs import scheduler
15
16
  from sky.jobs import state as managed_job_state
16
17
  from sky.jobs import utils as managed_job_utils
@@ -21,6 +22,7 @@ from sky.skylet import job_lib
21
22
  from sky.usage import usage_lib
22
23
  from sky.utils import cluster_utils
23
24
  from sky.utils import registry
25
+ from sky.utils import subprocess_utils
24
26
  from sky.utils import ux_utils
25
27
  from sky.utils import yaml_utils
26
28
 
@@ -74,6 +76,46 @@ class ManagedJobEvent(SkyletEvent):
74
76
  EVENT_INTERVAL_SECONDS = 300
75
77
 
76
78
  def _run(self):
79
+ if not os.path.exists(
80
+ os.path.expanduser(
81
+ managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)):
82
+ # Note: since the skylet is started before the user setup (in
83
+ # jobs-controller.yaml.j2) runs, it's possible that we hit this
84
+ # before the indicator file is written. However, since we will wait
85
+ # EVENT_INTERVAL_SECONDS before the first run, this should be very
86
+ # unlikely.
87
+ logger.info('No jobs controller indicator file found.')
88
+ all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
89
+ if not all_job_ids:
90
+ logger.info('No jobs running. Stopping controllers.')
91
+ # TODO(cooperc): Move this to a shared function also called by
92
+ # sdk.api_stop(). (#7229)
93
+ try:
94
+ with open(os.path.expanduser(
95
+ scheduler.JOB_CONTROLLER_PID_PATH),
96
+ 'r',
97
+ encoding='utf-8') as f:
98
+ pids = f.read().split('\n')[:-1]
99
+ for pid in pids:
100
+ if subprocess_utils.is_process_alive(
101
+ int(pid.strip())):
102
+ subprocess_utils.kill_children_processes(
103
+ parent_pids=[int(pid.strip())], force=True)
104
+ os.remove(
105
+ os.path.expanduser(scheduler.JOB_CONTROLLER_PID_PATH))
106
+ except FileNotFoundError:
107
+ # its fine we will create it
108
+ pass
109
+ except Exception as e: # pylint: disable=broad-except
110
+ # in case we get perm issues or something is messed up, just
111
+ # ignore it and assume the process is dead
112
+ logger.error(
113
+ f'Error looking at job controller pid file: {e}')
114
+ pass
115
+ logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
116
+ 'indicator file hasn\'t been written yet.')
117
+ return
118
+
77
119
  logger.info('=== Updating managed job status ===')
78
120
  managed_job_utils.update_managed_jobs_statuses()
79
121
  scheduler.maybe_start_controllers()
@@ -36,6 +36,9 @@ setup: |
36
36
  grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
37
37
  {% endif %}
38
38
 
39
+ # This is used by the skylet events to check if we are a jobs controller.
40
+ touch {{job_controller_indicator_file}}
41
+
39
42
  run: |
40
43
  {%- if consolidation_mode_job_id is none %}
41
44
  {{ sky_activate_python_env }}
@@ -510,6 +510,16 @@ available_node_types:
510
510
  valueFrom:
511
511
  fieldRef:
512
512
  fieldPath: metadata.labels['ray-node-type']
513
+ - name: SKYPILOT_POD_CPU_CORE_LIMIT
514
+ valueFrom:
515
+ resourceFieldRef:
516
+ containerName: ray-node
517
+ resource: requests.cpu
518
+ - name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
519
+ valueFrom:
520
+ resourceFieldRef:
521
+ containerName: ray-node
522
+ resource: requests.memory
513
523
  {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
514
524
  - name: {{ key }}
515
525
  value: {{ value }}
@@ -630,13 +640,6 @@ available_node_types:
630
640
  command: ["/bin/bash", "-c", "--"]
631
641
  args:
632
642
  - |
633
- # For backwards compatibility, we put a marker file in the pod
634
- # to indicate that the pod is running with the changes introduced
635
- # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
636
- # TODO: Remove this marker file and it's usage in setup_commands
637
- # after v0.10.0 release.
638
- touch /tmp/skypilot_is_nimbus
639
-
640
643
  # Helper function to conditionally use sudo
641
644
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
642
645
  prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
@@ -898,15 +901,20 @@ available_node_types:
898
901
  {{ conda_installation_commands }}
899
902
  {{ ray_installation_commands }}
900
903
 
901
- VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
904
+ # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
905
+ # unset PYTHONPATH in case the user provided docker image set it.
906
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
902
907
  # Wait for `patch` package to be installed before applying ray patches
903
908
  until dpkg -l | grep -q "^ii patch "; do
904
909
  sleep 0.1
905
910
  echo "Waiting for patch package to be installed..."
906
911
  done
907
912
  # Apply Ray patches for progress bar fix
908
- ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
909
- VIRTUAL_ENV=~/skypilot-runtime python -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
913
+ # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
914
+ # unset PYTHONPATH in case the user provided docker image set it.
915
+ # ~/.sky/python_path is seeded by conda_installation_commands
916
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
917
+ $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
910
918
  }
911
919
  touch /tmp/ray_skypilot_installation_complete
912
920
  echo "=== Ray and skypilot installation completed ==="
@@ -1333,18 +1341,16 @@ setup_commands:
1333
1341
  # Wait for SSH setup to complete before proceeding
1334
1342
  if [ -f /tmp/apt_ssh_setup_started ]; then
1335
1343
  echo "=== Logs for asynchronous SSH setup ===";
1336
- [ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
1337
- { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1344
+ ([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
1345
+ { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1338
1346
  [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
1339
1347
  fi
1340
1348
 
1341
1349
  echo "=== Logs for asynchronous ray and skypilot installation ===";
1342
- if [ -f /tmp/skypilot_is_nimbus ]; then
1343
- echo "=== Logs for asynchronous ray and skypilot installation ===";
1344
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
1345
- { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1346
- [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1347
- fi
1350
+ ([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
1351
+ { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1352
+ [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1353
+
1348
1354
  end_epoch=$(date +%s);
1349
1355
  echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
1350
1356
  start_epoch=$(date +%s);
sky/usage/usage_lib.py CHANGED
@@ -14,6 +14,7 @@ from typing_extensions import ParamSpec
14
14
 
15
15
  import sky
16
16
  from sky import sky_logging
17
+ from sky import skypilot_config
17
18
  from sky.adaptors import common as adaptors_common
18
19
  from sky.usage import constants
19
20
  from sky.utils import common_utils
@@ -167,6 +168,7 @@ class UsageMessageToReport(MessageToReport):
167
168
  self.runtimes: Dict[str, float] = {} # update_runtime
168
169
  self.exception: Optional[str] = None # entrypoint_context
169
170
  self.stacktrace: Optional[str] = None # entrypoint_context
171
+ self.skypilot_config: Optional[Dict[str, Any]] = None
170
172
 
171
173
  # Whether API server is deployed remotely.
172
174
  self.using_remote_api_server: bool = (
@@ -177,6 +179,7 @@ class UsageMessageToReport(MessageToReport):
177
179
  self.client_entrypoint = common_utils.get_current_client_entrypoint(
178
180
  msg)
179
181
  self.entrypoint = msg
182
+ self.skypilot_config = dict(skypilot_config.to_dict())
180
183
 
181
184
  def set_internal(self):
182
185
  self.internal = True
@@ -11,6 +11,7 @@ from sky.utils import common_utils
11
11
  from sky.utils import log_utils
12
12
  from sky.utils import resources_utils
13
13
  from sky.utils import status_lib
14
+ from sky.utils import ux_utils
14
15
 
15
16
  if typing.TYPE_CHECKING:
16
17
  from sky.provision.kubernetes import utils as kubernetes_utils
@@ -105,11 +106,9 @@ def show_status_table(cluster_records: List[responses.StatusResponse],
105
106
 
106
107
  if query_clusters:
107
108
  cluster_names = {record['name'] for record in cluster_records}
108
- not_found_clusters = [
109
- repr(cluster)
110
- for cluster in query_clusters
111
- if cluster not in cluster_names
112
- ]
109
+ not_found_clusters = ux_utils.get_non_matched_query(
110
+ query_clusters, cluster_names)
111
+ not_found_clusters = [repr(cluster) for cluster in not_found_clusters]
113
112
  if not_found_clusters:
114
113
  cluster_str = 'Cluster'
115
114
  if len(not_found_clusters) > 1:
sky/utils/context.py CHANGED
@@ -2,15 +2,21 @@
2
2
 
3
3
  import asyncio
4
4
  from collections.abc import Mapping
5
- from collections.abc import MutableMapping
6
5
  import contextvars
6
+ import copy
7
7
  import functools
8
+ import inspect
8
9
  import os
9
10
  import pathlib
10
11
  import subprocess
11
12
  import sys
12
- import typing
13
- from typing import Any, Callable, Dict, Optional, TextIO, TypeVar
13
+ from typing import (Callable, Dict, Iterator, MutableMapping, Optional, TextIO,
14
+ TYPE_CHECKING, TypeVar)
15
+
16
+ from typing_extensions import ParamSpec
17
+
18
+ if TYPE_CHECKING:
19
+ from sky.skypilot_config import ConfigContext
14
20
 
15
21
 
16
22
  class Context(object):
@@ -88,7 +94,7 @@ class Context(object):
88
94
  else:
89
95
  self._log_file_handle = open(log_file, 'a', encoding='utf-8')
90
96
  self._log_file = log_file
91
- if original_log_file is not None:
97
+ if original_log_handle is not None:
92
98
  original_log_handle.close()
93
99
  return original_log_file
94
100
 
@@ -102,8 +108,30 @@ class Context(object):
102
108
  for k, v in envs.items():
103
109
  self.env_overrides[k] = v
104
110
 
111
+ def cleanup(self):
112
+ """Clean up the context."""
113
+ if self._log_file_handle is not None:
114
+ self._log_file_handle.close()
115
+ self._log_file_handle = None
116
+
117
+ def copy(self) -> 'Context':
118
+ """Create a copy of the context.
119
+
120
+ Changes to the current context after this call will not affect the copy.
121
+ The new context will get its own handle/fd for the log file.
122
+ The new context will get an independent copy of the env var overrides.
123
+ The new context will get an independent copy of the config context.
124
+ Cancellation of the current context will not be propagated to the copy.
125
+ """
126
+ new_context = Context()
127
+ new_context.redirect_log(self._log_file)
128
+ new_context.env_overrides = self.env_overrides.copy()
129
+ new_context.config_context = copy.deepcopy(self.config_context)
130
+ return new_context
105
131
 
106
- _CONTEXT = contextvars.ContextVar('sky_context', default=None)
132
+
133
+ _CONTEXT = contextvars.ContextVar[Optional[Context]]('sky_context',
134
+ default=None)
107
135
 
108
136
 
109
137
  def get() -> Optional[Context]:
@@ -116,7 +144,7 @@ def get() -> Optional[Context]:
116
144
  return _CONTEXT.get()
117
145
 
118
146
 
119
- class ContextualEnviron(MutableMapping):
147
+ class ContextualEnviron(MutableMapping[str, str]):
120
148
  """Environment variables wrapper with contextual overrides.
121
149
 
122
150
  An instance of ContextualEnviron will typically be used to replace
@@ -155,10 +183,10 @@ class ContextualEnviron(MutableMapping):
155
183
  assert os.environ['FOO'] == 'BAR1'
156
184
  """
157
185
 
158
- def __init__(self, environ):
186
+ def __init__(self, environ: 'os._Environ[str]') -> None:
159
187
  self._environ = environ
160
188
 
161
- def __getitem__(self, key):
189
+ def __getitem__(self, key: str) -> str:
162
190
  ctx = get()
163
191
  if ctx is not None:
164
192
  if key in ctx.env_overrides:
@@ -170,10 +198,10 @@ class ContextualEnviron(MutableMapping):
170
198
  return value
171
199
  return self._environ[key]
172
200
 
173
- def __iter__(self):
174
- ctx = get()
175
- deleted_keys = set()
176
- if ctx is not None:
201
+ def __iter__(self) -> Iterator[str]:
202
+
203
+ def iter_from_context(ctx: Context) -> Iterator[str]:
204
+ deleted_keys = set()
177
205
  for key, value in ctx.env_overrides.items():
178
206
  if value is None:
179
207
  deleted_keys.add(key)
@@ -182,20 +210,24 @@ class ContextualEnviron(MutableMapping):
182
210
  # Deduplicate the keys
183
211
  if key not in ctx.env_overrides and key not in deleted_keys:
184
212
  yield key
213
+
214
+ ctx = get()
215
+ if ctx is not None:
216
+ return iter_from_context(ctx)
185
217
  else:
186
218
  return self._environ.__iter__()
187
219
 
188
- def __len__(self):
220
+ def __len__(self) -> int:
189
221
  return len(dict(self))
190
222
 
191
- def __setitem__(self, key, value):
223
+ def __setitem__(self, key: str, value: str) -> None:
192
224
  ctx = get()
193
225
  if ctx is not None:
194
226
  ctx.env_overrides[key] = value
195
227
  else:
196
228
  self._environ.__setitem__(key, value)
197
229
 
198
- def __delitem__(self, key):
230
+ def __delitem__(self, key: str) -> None:
199
231
  ctx = get()
200
232
  if ctx is not None:
201
233
  if key in ctx.env_overrides:
@@ -211,10 +243,13 @@ class ContextualEnviron(MutableMapping):
211
243
  else:
212
244
  self._environ.__delitem__(key)
213
245
 
214
- def __repr__(self):
215
- return self._environ.__repr__()
246
+ def __repr__(self) -> str:
247
+ # Adapted from os._Environ.__repr__
248
+ formatted_items = ', '.join(
249
+ f'{key!r}: {value!r}' for key, value in self.items())
250
+ return f'ctx_environ({{{formatted_items}}})'
216
251
 
217
- def copy(self):
252
+ def copy(self) -> Dict[str, str]:
218
253
  copied = self._environ.copy()
219
254
  ctx = get()
220
255
  if ctx is not None:
@@ -225,7 +260,7 @@ class ContextualEnviron(MutableMapping):
225
260
  copied[key] = ctx.env_overrides[key]
226
261
  return copied
227
262
 
228
- def setdefault(self, key, default=None):
263
+ def setdefault(self, key: str, default: str) -> str:
229
264
  return self._environ.setdefault(key, default)
230
265
 
231
266
  def __ior__(self, other):
@@ -260,27 +295,67 @@ class Popen(subprocess.Popen):
260
295
  super().__init__(*args, env=env, **kwargs)
261
296
 
262
297
 
263
- F = TypeVar('F', bound=Callable[..., Any])
298
+ P = ParamSpec('P')
299
+ T = TypeVar('T')
264
300
 
265
301
 
266
- def contextual(func: F) -> F:
302
+ def contextual(func: Callable[P, T]) -> Callable[P, T]:
267
303
  """Decorator to initialize a context before executing the function.
268
304
 
269
- If a context is already initialized, this decorator will reset the context,
270
- i.e. all contextual variables set previously will be cleared.
305
+ If a context is already initialized, this decorator will create a new
306
+ context that inherits the values from the existing context.
271
307
  """
272
308
 
273
309
  @functools.wraps(func)
274
- def wrapper(*args, **kwargs):
275
- initialize()
276
- return func(*args, **kwargs)
310
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
311
+ original_ctx = get()
312
+ initialize(original_ctx)
313
+ ctx = get()
314
+ cleanup_after_await = False
315
+
316
+ def cleanup():
317
+ try:
318
+ if ctx is not None:
319
+ ctx.cleanup()
320
+ finally:
321
+ # Note: _CONTEXT.reset() is not reliable - may fail with
322
+ # ValueError: <Token ... at ...> was created in a different
323
+ # Context
324
+ # We must make sure this happens because otherwise we may try to
325
+ # write to the wrong log.
326
+ _CONTEXT.set(original_ctx)
327
+
328
+ # There are two cases:
329
+ # 1. The function is synchronous (that is, return type is not awaitable)
330
+ # In this case, we use a finally block to cleanup the context.
331
+ # 2. The function is asynchronous (that is, return type is awaitable)
332
+ # In this case, we need to construct an async def wrapper and await
333
+ # the value, then call the cleanup function in the finally block.
334
+
335
+ async def await_with_cleanup(awaitable):
336
+ try:
337
+ return await awaitable
338
+ finally:
339
+ cleanup()
340
+
341
+ try:
342
+ ret = func(*args, **kwargs)
343
+ if inspect.isawaitable(ret):
344
+ cleanup_after_await = True
345
+ return await_with_cleanup(ret)
346
+ else:
347
+ return ret
348
+ finally:
349
+ if not cleanup_after_await:
350
+ cleanup()
277
351
 
278
- return typing.cast(F, wrapper)
352
+ return wrapper
279
353
 
280
354
 
281
- def initialize():
355
+ def initialize(base_context: Optional[Context] = None) -> None:
282
356
  """Initialize the current SkyPilot context."""
283
- _CONTEXT.set(Context())
357
+ new_context = base_context.copy() if base_context is not None else Context()
358
+ _CONTEXT.set(new_context)
284
359
 
285
360
 
286
361
  class _ContextualStream:
@@ -620,15 +620,16 @@ def get_controller_resources(
620
620
  controller_resources_to_use: resources.Resources = list(
621
621
  controller_resources)[0]
622
622
 
623
- controller_record = global_user_state.get_cluster_from_name(
623
+ controller_handle = global_user_state.get_handle_from_cluster_name(
624
624
  controller.value.cluster_name)
625
- if controller_record is not None:
626
- handle = controller_record.get('handle', None)
627
- if handle is not None:
625
+ if controller_handle is not None:
626
+ if controller_handle is not None:
628
627
  # Use the existing resources, but override the autostop config with
629
628
  # the one currently specified in the config.
630
- controller_resources_to_use = handle.launched_resources.copy(
631
- autostop=controller_resources_config_copied.get('autostop'))
629
+ controller_resources_to_use = (
630
+ controller_handle.launched_resources.copy(
631
+ autostop=controller_resources_config_copied.get('autostop'))
632
+ )
632
633
 
633
634
  # If the controller and replicas are from the same cloud (and region/zone),
634
635
  # it should provide better connectivity. We will let the controller choose
sky/utils/db/db_utils.py CHANGED
@@ -201,6 +201,7 @@ def add_column_to_table_alembic(
201
201
  server_default: Optional[str] = None,
202
202
  copy_from: Optional[str] = None,
203
203
  value_to_replace_existing_entries: Optional[Any] = None,
204
+ index: Optional[bool] = None,
204
205
  ):
205
206
  """Add a column to a table using Alembic operations.
206
207
 
@@ -215,6 +216,8 @@ def add_column_to_table_alembic(
215
216
  copy_from: Column name to copy values from (for existing rows)
216
217
  value_to_replace_existing_entries: Default value for existing NULL
217
218
  entries
219
+ index: If True, create an index on this column. If None, no index
220
+ is created.
218
221
  """
219
222
  from alembic import op # pylint: disable=import-outside-toplevel
220
223
 
@@ -222,7 +225,8 @@ def add_column_to_table_alembic(
222
225
  # Create the column with server_default if provided
223
226
  column = sqlalchemy.Column(column_name,
224
227
  column_type,
225
- server_default=server_default)
228
+ server_default=server_default,
229
+ index=index)
226
230
  op.add_column(table_name, column)
227
231
 
228
232
  # Handle data migration
@@ -17,7 +17,7 @@ logger = sky_logging.init_logger(__name__)
17
17
  DB_INIT_LOCK_TIMEOUT_SECONDS = 10
18
18
 
19
19
  GLOBAL_USER_STATE_DB_NAME = 'state_db'
20
- GLOBAL_USER_STATE_VERSION = '008'
20
+ GLOBAL_USER_STATE_VERSION = '009'
21
21
  GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
22
22
 
23
23
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'