skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,78 @@
1
+ """Asyncio utilities."""
2
+
3
+ import asyncio
4
+ import functools
5
+ from typing import Set
6
+
7
+ _background_tasks: Set[asyncio.Task] = set()
8
+
9
+
10
+ def shield(func):
11
+ """Shield the decorated async function from cancellation.
12
+
13
+ If the outer coroutine is cancelled, the inner decorated function
14
+ will be protected from cancellation by asyncio.shield(). And we will
15
+ maintain a reference to the inner task to prevent it from being GCed before
16
+ it is done.
17
+
18
+ For example, filelock.AsyncFileLock is not cancellation safe. The
19
+ following code:
20
+
21
+ async def fn_with_lock():
22
+ async with filelock.AsyncFileLock('lock'):
23
+ await asyncio.sleep(1)
24
+
25
+ is equivalent to:
26
+
27
+ # The lock may leak if the cancellation happens in
28
+ # lock.acquire() or lock.release()
29
+ async def fn_with_lock():
30
+ lock = filelock.AsyncFileLock('lock')
31
+ await lock.acquire()
32
+ try:
33
+ await asyncio.sleep(1)
34
+ finally:
35
+ await lock.release()
36
+
37
+ Shielding the function ensures that no cancellation will happen in the
38
+ function, thus the lock will be released properly:
39
+
40
+ @shield
41
+ async def fn_with_lock():
42
+
43
+ Note that the resource acquisition and release should usually be protected
44
+ in one @shield block but not separately, e.g.:
45
+
46
+ lock = filelock.AsyncFileLock('lock')
47
+
48
+ @shield
49
+ async def acquire():
50
+ await lock.acquire()
51
+
52
+ @shield
53
+ async def release():
54
+ await lock.release()
55
+
56
+ async def fn_with_lock():
57
+ await acquire()
58
+ try:
59
+ do_something()
60
+ finally:
61
+ await release()
62
+
63
+ The above code is not safe because if `fn_with_lock` is cancelled,
64
+ `acquire()` and `release()` will be executed in the background
65
+ concurrently and cause race conditions.
66
+ """
67
+
68
+ @functools.wraps(func)
69
+ async def async_wrapper(*args, **kwargs):
70
+ task = asyncio.create_task(func(*args, **kwargs))
71
+ try:
72
+ return await asyncio.shield(task)
73
+ except asyncio.CancelledError:
74
+ _background_tasks.add(task)
75
+ task.add_done_callback(lambda _: _background_tasks.discard(task))
76
+ raise
77
+
78
+ return async_wrapper
@@ -0,0 +1,153 @@
1
+ """Utils for managing SkyPilot SSH key pairs."""
2
+
3
+ import functools
4
+ import os
5
+ from typing import Tuple
6
+
7
+ import filelock
8
+
9
+ from sky import global_user_state
10
+ from sky import sky_logging
11
+ from sky.utils import common_utils
12
+
13
+ logger = sky_logging.init_logger(__name__)
14
+
15
+ MAX_TRIALS = 64
16
+ # TODO(zhwu): Support user specified key pair.
17
+ # We intentionally do not store the ssh key pair in
18
+ # ~/.sky/api_server/clients, i.e. sky.server.common.API_SERVER_CLIENT_DIR,
19
+ # because the ssh key pair needs to persist across API server restarts, while
20
+ # the former dir is ephemeral.
21
+ _SSH_KEY_PATH_PREFIX = '~/.sky/clients/{user_hash}/ssh'
22
+
23
+
24
+ def get_ssh_key_and_lock_path(user_hash: str) -> Tuple[str, str, str]:
25
+ user_ssh_key_prefix = _SSH_KEY_PATH_PREFIX.format(user_hash=user_hash)
26
+
27
+ os.makedirs(os.path.expanduser(user_ssh_key_prefix),
28
+ exist_ok=True,
29
+ mode=0o700)
30
+ private_key_path = os.path.join(user_ssh_key_prefix, 'sky-key')
31
+ public_key_path = os.path.join(user_ssh_key_prefix, 'sky-key.pub')
32
+ lock_path = os.path.join(user_ssh_key_prefix, '.__internal-sky-key.lock')
33
+ return private_key_path, public_key_path, lock_path
34
+
35
+
36
+ def _generate_rsa_key_pair() -> Tuple[str, str]:
37
+ # Keep the import of the cryptography local to avoid expensive
38
+ # third-party imports when not needed.
39
+ # pylint: disable=import-outside-toplevel
40
+ from cryptography.hazmat.backends import default_backend
41
+ from cryptography.hazmat.primitives import serialization
42
+ from cryptography.hazmat.primitives.asymmetric import rsa
43
+
44
+ key = rsa.generate_private_key(backend=default_backend(),
45
+ public_exponent=65537,
46
+ key_size=2048)
47
+
48
+ private_key = key.private_bytes(
49
+ encoding=serialization.Encoding.PEM,
50
+ format=serialization.PrivateFormat.TraditionalOpenSSL,
51
+ encryption_algorithm=serialization.NoEncryption()).decode(
52
+ 'utf-8').strip()
53
+
54
+ public_key = key.public_key().public_bytes(
55
+ serialization.Encoding.OpenSSH,
56
+ serialization.PublicFormat.OpenSSH).decode('utf-8').strip()
57
+
58
+ return public_key, private_key
59
+
60
+
61
+ def _save_key_pair(private_key_path: str, public_key_path: str,
62
+ private_key: str, public_key: str) -> None:
63
+ key_dir = os.path.dirname(private_key_path)
64
+ os.makedirs(key_dir, exist_ok=True, mode=0o700)
65
+
66
+ with open(
67
+ private_key_path,
68
+ 'w',
69
+ encoding='utf-8',
70
+ opener=functools.partial(os.open, mode=0o600),
71
+ ) as f:
72
+ f.write(private_key)
73
+
74
+ with open(public_key_path,
75
+ 'w',
76
+ encoding='utf-8',
77
+ opener=functools.partial(os.open, mode=0o644)) as f:
78
+ f.write(public_key)
79
+
80
+
81
+ def get_or_generate_keys() -> Tuple[str, str]:
82
+ """Returns the absolute private and public key paths."""
83
+ user_hash = common_utils.get_user_hash()
84
+ private_key_path, public_key_path, lock_path = get_ssh_key_and_lock_path(
85
+ user_hash)
86
+ private_key_path = os.path.expanduser(private_key_path)
87
+ public_key_path = os.path.expanduser(public_key_path)
88
+ lock_path = os.path.expanduser(lock_path)
89
+
90
+ lock_dir = os.path.dirname(lock_path)
91
+ # We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
92
+ # as the ssh configs will be written to this folder as well in
93
+ # backend_utils.SSHConfigHelper
94
+ os.makedirs(lock_dir, exist_ok=True, mode=0o700)
95
+ with filelock.FileLock(lock_path, timeout=10):
96
+ if not os.path.exists(private_key_path):
97
+ ssh_public_key, ssh_private_key, exists = (
98
+ global_user_state.get_ssh_keys(user_hash))
99
+ if not exists:
100
+ ssh_public_key, ssh_private_key = _generate_rsa_key_pair()
101
+ global_user_state.set_ssh_keys(user_hash, ssh_public_key,
102
+ ssh_private_key)
103
+ _save_key_pair(private_key_path, public_key_path, ssh_private_key,
104
+ ssh_public_key)
105
+ assert os.path.exists(public_key_path), (
106
+ 'Private key found, but associated public key '
107
+ f'{public_key_path} does not exist.')
108
+ return private_key_path, public_key_path
109
+
110
+
111
+ def create_ssh_key_files_from_db(private_key_path: str) -> bool:
112
+ """Creates the ssh key files from the database.
113
+
114
+ Returns:
115
+ True if the ssh key files are created successfully, False otherwise.
116
+ """
117
+ # Assume private key path is in the format of
118
+ # ~/.sky/clients/<user_hash>/ssh/sky-key
119
+ separated_path = os.path.normpath(private_key_path).split(os.path.sep)
120
+ assert separated_path[-1] == 'sky-key'
121
+ assert separated_path[-2] == 'ssh'
122
+ user_hash = separated_path[-3]
123
+
124
+ private_key_path_generated, public_key_path, lock_path = (
125
+ get_ssh_key_and_lock_path(user_hash))
126
+ assert private_key_path == os.path.expanduser(private_key_path_generated), (
127
+ f'Private key path {private_key_path} does not '
128
+ 'match the generated path '
129
+ f'{os.path.expanduser(private_key_path_generated)}')
130
+ private_key_path = os.path.expanduser(private_key_path)
131
+ public_key_path = os.path.expanduser(public_key_path)
132
+ lock_path = os.path.expanduser(lock_path)
133
+ lock_dir = os.path.dirname(lock_path)
134
+
135
+ if os.path.exists(private_key_path) and os.path.exists(public_key_path):
136
+ return True
137
+ # We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
138
+ # as the ssh configs will be written to this folder as well in
139
+ # backend_utils.SSHConfigHelper
140
+ os.makedirs(lock_dir, exist_ok=True, mode=0o700)
141
+ with filelock.FileLock(lock_path, timeout=10):
142
+ if not os.path.exists(private_key_path):
143
+ ssh_public_key, ssh_private_key, exists = (
144
+ global_user_state.get_ssh_keys(user_hash))
145
+ if not exists:
146
+ logger.debug(f'SSH keys not found for user {user_hash}')
147
+ return False
148
+ _save_key_pair(private_key_path, public_key_path, ssh_private_key,
149
+ ssh_public_key)
150
+ assert os.path.exists(public_key_path), (
151
+ 'Private key found, but associated public key '
152
+ f'{public_key_path} does not exist.')
153
+ return True
@@ -282,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
282
282
  if resources_str_full is not None:
283
283
  resources_str = resources_str_full
284
284
  if resources_str is None:
285
- resources_str = resources_utils.get_readable_resources_repr(
286
- handle, simplify=truncate)
285
+ resources_str_simple, resources_str_full = (
286
+ resources_utils.get_readable_resources_repr(
287
+ handle, simplified_only=truncate))
288
+ if truncate:
289
+ resources_str = resources_str_simple
290
+ else:
291
+ assert resources_str_full is not None
292
+ resources_str = resources_str_full
287
293
 
288
294
  return resources_str
289
295
  return '-'
@@ -14,6 +14,7 @@ from sky import exceptions
14
14
  from sky import sky_logging
15
15
  from sky.skylet import constants
16
16
  from sky.skylet import log_lib
17
+ from sky.utils import auth_utils
17
18
  from sky.utils import common_utils
18
19
  from sky.utils import context_utils
19
20
  from sky.utils import control_master_utils
@@ -649,6 +650,8 @@ class SSHCommandRunner(CommandRunner):
649
650
  self.disable_control_master = (
650
651
  disable_control_master or
651
652
  control_master_utils.should_disable_control_master())
653
+ # ensure the ssh key files are created from the database
654
+ auth_utils.create_ssh_key_files_from_db(ssh_private_key)
652
655
  if docker_user is not None:
653
656
  assert port is None or port == 22, (
654
657
  f'port must be None or 22 for docker_user, got {port}.')
@@ -961,8 +964,16 @@ class KubernetesCommandRunner(CommandRunner):
961
964
  kubectl_args = [
962
965
  '--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
963
966
  ]
967
+ # The same logic to either set `--context` to the k8s context where
968
+ # the sky cluster is hosted, or `--kubeconfig` to /dev/null for
969
+ # in-cluster k8s is used below in the `run()` method.
964
970
  if self.context:
965
971
  kubectl_args += ['--context', self.context]
972
+ # If context is none, it means the cluster is hosted on in-cluster k8s.
973
+ # In this case, we need to set KUBECONFIG to /dev/null to avoid looking
974
+ # for the cluster in whatever active context is set in the kubeconfig.
975
+ else:
976
+ kubectl_args += ['--kubeconfig', '/dev/null']
966
977
  local_port, remote_port = port_forward[0]
967
978
  local_port_str = f'{local_port}' if local_port is not None else ''
968
979
 
sky/utils/common.py CHANGED
@@ -31,7 +31,7 @@ JOB_CONTROLLER_NAME: str
31
31
  def refresh_server_id() -> None:
32
32
  """Refresh the server id.
33
33
 
34
- This function is used to ensure the server id is read from the authorative
34
+ This function is used to ensure the server id is read from the authoritative
35
35
  source.
36
36
  """
37
37
  global SERVER_ID
@@ -42,6 +42,8 @@ def refresh_server_id() -> None:
42
42
  JOB_CONTROLLER_NAME = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
43
43
 
44
44
 
45
+ # TODO(kevin): Remove this side effect and have callers call
46
+ # refresh_server_id() explicitly as needed.
45
47
  refresh_server_id()
46
48
 
47
49
 
sky/utils/common_utils.py CHANGED
@@ -265,13 +265,16 @@ def get_global_job_id(job_timestamp: str,
265
265
 
266
266
  class Backoff:
267
267
  """Exponential backoff with jittering."""
268
- MULTIPLIER = 1.6
269
268
  JITTER = 0.4
270
269
 
271
- def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
270
+ def __init__(self,
271
+ initial_backoff: float = 5,
272
+ max_backoff_factor: int = 5,
273
+ multiplier: float = 1.6):
272
274
  self._initial = True
273
275
  self._backoff = 0.0
274
276
  self._initial_backoff = initial_backoff
277
+ self._multiplier = multiplier
275
278
  self._max_backoff = max_backoff_factor * self._initial_backoff
276
279
 
277
280
  # https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
@@ -283,7 +286,7 @@ class Backoff:
283
286
  self._initial = False
284
287
  self._backoff = min(self._initial_backoff, self._max_backoff)
285
288
  else:
286
- self._backoff = min(self._backoff * self.MULTIPLIER,
289
+ self._backoff = min(self._backoff * self._multiplier,
287
290
  self._max_backoff)
288
291
  self._backoff += random.uniform(-self.JITTER * self._backoff,
289
292
  self.JITTER * self._backoff)
@@ -1112,7 +1115,7 @@ def release_memory():
1112
1115
  gc.collect()
1113
1116
  if sys.platform.startswith('linux'):
1114
1117
  # Will fail on musl (alpine), but at least it works on our
1115
- # offical docker images.
1118
+ # official docker images.
1116
1119
  libc = ctypes.CDLL('libc.so.6')
1117
1120
  return libc.malloc_trim(0)
1118
1121
  return 0
sky/utils/context.py CHANGED
@@ -5,13 +5,12 @@ from collections.abc import Mapping
5
5
  import contextvars
6
6
  import copy
7
7
  import functools
8
- import inspect
9
8
  import os
10
9
  import pathlib
11
10
  import subprocess
12
11
  import sys
13
- from typing import (Callable, Dict, Iterator, MutableMapping, Optional, TextIO,
14
- TYPE_CHECKING, TypeVar)
12
+ from typing import (Any, Callable, Coroutine, Dict, Iterator, MutableMapping,
13
+ Optional, TextIO, TYPE_CHECKING, TypeVar)
15
14
 
16
15
  from typing_extensions import ParamSpec
17
16
 
@@ -19,7 +18,7 @@ if TYPE_CHECKING:
19
18
  from sky.skypilot_config import ConfigContext
20
19
 
21
20
 
22
- class Context(object):
21
+ class SkyPilotContext(object):
23
22
  """SkyPilot typed context vars for threads and coroutines.
24
23
 
25
24
  This is a wrapper around `contextvars.ContextVar` that provides a typed
@@ -114,7 +113,14 @@ class Context(object):
114
113
  self._log_file_handle.close()
115
114
  self._log_file_handle = None
116
115
 
117
- def copy(self) -> 'Context':
116
+ def __enter__(self):
117
+ return self
118
+
119
+ def __exit__(self, exc_type, exc_val, exc_tb):
120
+ del exc_type, exc_val, exc_tb
121
+ self.cleanup()
122
+
123
+ def copy(self) -> 'SkyPilotContext':
118
124
  """Create a copy of the context.
119
125
 
120
126
  Changes to the current context after this call will not affect the copy.
@@ -123,18 +129,18 @@ class Context(object):
123
129
  The new context will get an independent copy of the config context.
124
130
  Cancellation of the current context will not be propagated to the copy.
125
131
  """
126
- new_context = Context()
132
+ new_context = SkyPilotContext()
127
133
  new_context.redirect_log(self._log_file)
128
134
  new_context.env_overrides = self.env_overrides.copy()
129
135
  new_context.config_context = copy.deepcopy(self.config_context)
130
136
  return new_context
131
137
 
132
138
 
133
- _CONTEXT = contextvars.ContextVar[Optional[Context]]('sky_context',
134
- default=None)
139
+ _CONTEXT = contextvars.ContextVar[Optional[SkyPilotContext]]('sky_context',
140
+ default=None)
135
141
 
136
142
 
137
- def get() -> Optional[Context]:
143
+ def get() -> Optional[SkyPilotContext]:
138
144
  """Get the current SkyPilot context.
139
145
 
140
146
  If the context is not initialized, get() will return None. This helps
@@ -200,7 +206,7 @@ class ContextualEnviron(MutableMapping[str, str]):
200
206
 
201
207
  def __iter__(self) -> Iterator[str]:
202
208
 
203
- def iter_from_context(ctx: Context) -> Iterator[str]:
209
+ def iter_from_context(ctx: SkyPilotContext) -> Iterator[str]:
204
210
  deleted_keys = set()
205
211
  for key, value in ctx.env_overrides.items():
206
212
  if value is None:
@@ -311,56 +317,56 @@ def contextual(func: Callable[P, T]) -> Callable[P, T]:
311
317
  context that inherits the values from the existing context.
312
318
  """
313
319
 
320
+ def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
321
+ # Within the new contextvars Context, set up the SkyPilotContext.
322
+ original_ctx = get()
323
+ with initialize(original_ctx):
324
+ return func(*args, **kwargs)
325
+
314
326
  @functools.wraps(func)
315
327
  def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
328
+ # Create a copy of the current contextvars Context so that setting the
329
+ # SkyPilotContext does not affect the caller's context in async
330
+ # environments.
331
+ context = contextvars.copy_context()
332
+ return context.run(run_in_context, *args, **kwargs)
333
+
334
+ return wrapper
335
+
336
+
337
+ def contextual_async(
338
+ func: Callable[P, Coroutine[Any, Any, T]]
339
+ ) -> Callable[P, Coroutine[Any, Any, T]]:
340
+ """Decorator to initialize a context before executing the function.
341
+
342
+ If a context is already initialized, this decorator will create a new
343
+ context that inherits the values from the existing context.
344
+ """
345
+
346
+ async def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
347
+ # Within the new contextvars Context, set up the SkyPilotContext.
316
348
  original_ctx = get()
317
- initialize(original_ctx)
318
- ctx = get()
319
- cleanup_after_await = False
320
-
321
- def cleanup():
322
- try:
323
- if ctx is not None:
324
- ctx.cleanup()
325
- finally:
326
- # Note: _CONTEXT.reset() is not reliable - may fail with
327
- # ValueError: <Token ... at ...> was created in a different
328
- # Context
329
- # We must make sure this happens because otherwise we may try to
330
- # write to the wrong log.
331
- _CONTEXT.set(original_ctx)
332
-
333
- # There are two cases:
334
- # 1. The function is synchronous (that is, return type is not awaitable)
335
- # In this case, we use a finally block to cleanup the context.
336
- # 2. The function is asynchronous (that is, return type is awaitable)
337
- # In this case, we need to construct an async def wrapper and await
338
- # the value, then call the cleanup function in the finally block.
339
-
340
- async def await_with_cleanup(awaitable):
341
- try:
342
- return await awaitable
343
- finally:
344
- cleanup()
345
-
346
- try:
347
- ret = func(*args, **kwargs)
348
- if inspect.isawaitable(ret):
349
- cleanup_after_await = True
350
- return await_with_cleanup(ret)
351
- else:
352
- return ret
353
- finally:
354
- if not cleanup_after_await:
355
- cleanup()
349
+ with initialize(original_ctx):
350
+ return await func(*args, **kwargs)
351
+
352
+ @functools.wraps(func)
353
+ async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
354
+ # Create a copy of the current contextvars Context so that setting the
355
+ # SkyPilotContext does not affect the caller's context in async
356
+ # environments.
357
+ context = contextvars.copy_context()
358
+ return await context.run(run_in_context, *args, **kwargs)
356
359
 
357
360
  return wrapper
358
361
 
359
362
 
360
- def initialize(base_context: Optional[Context] = None) -> None:
363
+ def initialize(
364
+ base_context: Optional[SkyPilotContext] = None) -> SkyPilotContext:
361
365
  """Initialize the current SkyPilot context."""
362
- new_context = base_context.copy() if base_context is not None else Context()
366
+ new_context = base_context.copy(
367
+ ) if base_context is not None else SkyPilotContext()
363
368
  _CONTEXT.set(new_context)
369
+ return new_context
364
370
 
365
371
 
366
372
  class _ContextualStream:
@@ -1,5 +1,6 @@
1
1
  """Utilities for SkyPilot context."""
2
2
  import asyncio
3
+ import concurrent.futures
3
4
  import contextvars
4
5
  import functools
5
6
  import io
@@ -7,6 +8,7 @@ import multiprocessing
7
8
  import os
8
9
  import subprocess
9
10
  import sys
11
+ import time
10
12
  import typing
11
13
  from typing import Any, Callable, IO, Optional, Tuple, TypeVar
12
14
 
@@ -17,6 +19,9 @@ from sky.utils import context
17
19
  from sky.utils import subprocess_utils
18
20
 
19
21
  StreamHandler = Callable[[IO[Any], IO[Any]], str]
22
+ PASSTHROUGH_FLUSH_INTERVAL_SECONDS = 0.5
23
+
24
+ logger = sky_logging.init_logger(__name__)
20
25
 
21
26
 
22
27
  # TODO(aylei): call hijack_sys_attrs() proactively in module init at server-side
@@ -43,6 +48,7 @@ def hijack_sys_attrs():
43
48
 
44
49
  def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
45
50
  """Passthrough the stream from the process to the output stream"""
51
+ last_flush_time = time.time()
46
52
  wrapped = io.TextIOWrapper(in_stream,
47
53
  encoding='utf-8',
48
54
  newline='',
@@ -52,14 +58,23 @@ def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
52
58
  line = wrapped.readline()
53
59
  if line:
54
60
  out_stream.write(line)
55
- out_stream.flush()
61
+
62
+ # Flush based on timeout instead of on every line
63
+ current_time = time.time()
64
+ if (current_time - last_flush_time >=
65
+ PASSTHROUGH_FLUSH_INTERVAL_SECONDS):
66
+ out_stream.flush()
67
+ last_flush_time = current_time
56
68
  else:
57
69
  break
70
+
71
+ # Final flush to ensure all data is written
72
+ out_stream.flush()
58
73
  return ''
59
74
 
60
75
 
61
76
  def pipe_and_wait_process(
62
- ctx: context.Context,
77
+ ctx: context.SkyPilotContext,
63
78
  proc: subprocess.Popen,
64
79
  poll_interval: float = 0.5,
65
80
  cancel_callback: Optional[Callable[[], None]] = None,
@@ -112,7 +127,7 @@ def pipe_and_wait_process(
112
127
  return stdout, stderr
113
128
 
114
129
 
115
- def wait_process(ctx: context.Context,
130
+ def wait_process(ctx: context.SkyPilotContext,
116
131
  proc: subprocess.Popen,
117
132
  poll_interval: float = 0.5,
118
133
  cancel_callback: Optional[Callable[[], None]] = None):
@@ -191,14 +206,17 @@ def to_thread(func: Callable[P, T], /, *args: P.args,
191
206
 
192
207
  This is the same as asyncio.to_thread added in python 3.9
193
208
  """
209
+ return to_thread_with_executor(None, func, *args, **kwargs)
210
+
211
+
212
+ def to_thread_with_executor(executor: Optional[concurrent.futures.Executor],
213
+ func: Callable[P, T], /, *args: P.args,
214
+ **kwargs: P.kwargs) -> 'asyncio.Future[T]':
215
+ """Asynchronously run function *func* in a separate thread with
216
+ a custom executor."""
217
+
194
218
  loop = asyncio.get_running_loop()
195
- # This is critical to pass the current coroutine context to the new thread
196
219
  pyctx = contextvars.copy_context()
197
- func_call: Callable[..., T] = functools.partial(
198
- # partial deletes arguments type and thus can't figure out the return
199
- # type of pyctx.run
200
- pyctx.run, # type: ignore
201
- func,
202
- *args,
203
- **kwargs)
204
- return loop.run_in_executor(None, func_call)
220
+ func_call: Callable[..., T] = functools.partial(pyctx.run, func, *args,
221
+ **kwargs)
222
+ return loop.run_in_executor(executor, func_call)