skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,7 @@ import fastapi
11
11
  from sky import global_user_state
12
12
  from sky import sky_logging
13
13
  from sky.server.requests import requests as requests_lib
14
+ from sky.utils import common_utils
14
15
  from sky.utils import message_utils
15
16
  from sky.utils import rich_utils
16
17
  from sky.utils import status_lib
@@ -24,7 +25,22 @@ logger = sky_logging.init_logger(__name__)
24
25
  _BUFFER_SIZE = 8 * 1024 # 8KB
25
26
  _BUFFER_TIMEOUT = 0.02 # 20ms
26
27
  _HEARTBEAT_INTERVAL = 30
27
- _CLUSTER_STATUS_INTERVAL = 1
28
+ _READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
29
+
30
+ # If a SHORT request has been stuck in pending for
31
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
32
+ _SHORT_REQUEST_SPINNER_TIMEOUT = 2
33
+ # If there is an issue during provisioning that causes the cluster to be stuck
34
+ # in INIT state, we use this timeout to break the loop and stop streaming
35
+ # provision logs.
36
+ _PROVISION_LOG_TIMEOUT = 3
37
+ # Maximum time to wait for new log files to appear when streaming worker node
38
+ # provision logs. Worker logs are created sequentially during the provisioning
39
+ # process, so we need to wait for new files to appear.
40
+ _MAX_WAIT_FOR_NEW_LOG_FILES = 3 # seconds
41
+
42
+ LONG_REQUEST_POLL_INTERVAL = 1
43
+ DEFAULT_POLL_INTERVAL = 0.1
28
44
 
29
45
 
30
46
  async def _yield_log_file_with_payloads_skipped(
@@ -41,18 +57,22 @@ async def _yield_log_file_with_payloads_skipped(
41
57
 
42
58
 
43
59
  async def log_streamer(
44
- request_id: Optional[str],
45
- log_path: pathlib.Path,
46
- plain_logs: bool = False,
47
- tail: Optional[int] = None,
48
- follow: bool = True,
49
- cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
60
+ request_id: Optional[str],
61
+ log_path: Optional[pathlib.Path] = None,
62
+ plain_logs: bool = False,
63
+ tail: Optional[int] = None,
64
+ follow: bool = True,
65
+ cluster_name: Optional[str] = None,
66
+ polling_interval: float = DEFAULT_POLL_INTERVAL
67
+ ) -> AsyncGenerator[str, None]:
50
68
  """Streams the logs of a request.
51
69
 
52
70
  Args:
53
71
  request_id: The request ID to check whether the log tailing process
54
72
  should be stopped.
55
- log_path: The path to the log file.
73
+ log_path: The path to the log file or directory containing the log
74
+ files. If it is a directory, all *.log files in the directory will be
75
+ streamed.
56
76
  plain_logs: Whether to show plain logs.
57
77
  tail: The number of lines to tail. If None, tail the whole file.
58
78
  follow: Whether to follow the log file.
@@ -61,17 +81,26 @@ async def log_streamer(
61
81
  """
62
82
 
63
83
  if request_id is not None:
84
+ start_time = asyncio.get_event_loop().time()
64
85
  status_msg = rich_utils.EncodedStatusMessage(
65
86
  f'[dim]Checking request: {request_id}[/dim]')
66
- request_task = await requests_lib.get_request_async(request_id)
87
+ request_task = await requests_lib.get_request_async(request_id,
88
+ fields=[
89
+ 'request_id',
90
+ 'name',
91
+ 'schedule_type',
92
+ 'status',
93
+ 'status_msg'
94
+ ])
67
95
 
68
96
  if request_task is None:
69
97
  raise fastapi.HTTPException(
70
98
  status_code=404, detail=f'Request {request_id} not found')
71
99
  request_id = request_task.request_id
72
100
 
73
- # Do not show the waiting spinner if the request is a fast, non-blocking
74
- # request.
101
+ # By default, do not show the waiting spinner for SHORT requests.
102
+ # If the request has been stuck in pending for
103
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
75
104
  show_request_waiting_spinner = (not plain_logs and
76
105
  request_task.schedule_type
77
106
  == requests_lib.ScheduleType.LONG)
@@ -84,9 +113,23 @@ async def log_streamer(
84
113
  f'scheduled: {request_id}')
85
114
  req_status = request_task.status
86
115
  req_msg = request_task.status_msg
116
+ del request_task
117
+ # Slowly back off the database polling up to every 1 second, to avoid
118
+ # overloading the CPU and DB.
119
+ backoff = common_utils.Backoff(initial_backoff=polling_interval,
120
+ max_backoff_factor=10,
121
+ multiplier=1.2)
87
122
  while req_status < requests_lib.RequestStatus.RUNNING:
123
+ current_time = asyncio.get_event_loop().time()
124
+ # Show the waiting spinner for a SHORT request if it has been stuck
125
+ # in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
126
+ if not show_request_waiting_spinner and (
127
+ current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
128
+ show_request_waiting_spinner = True
129
+ yield status_msg.init()
130
+ yield status_msg.start()
88
131
  if req_msg is not None:
89
- waiting_msg = request_task.status_msg
132
+ waiting_msg = req_msg
90
133
  if show_request_waiting_spinner:
91
134
  yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
92
135
  elif plain_logs and waiting_msg != last_waiting_msg:
@@ -99,7 +142,7 @@ async def log_streamer(
99
142
  # TODO(aylei): we should use a better mechanism to avoid busy
100
143
  # polling the DB, which can be a bottleneck for high-concurrency
101
144
  # requests.
102
- await asyncio.sleep(0.1)
145
+ await asyncio.sleep(backoff.current_backoff())
103
146
  status_with_msg = await requests_lib.get_request_status_async(
104
147
  request_id, include_msg=True)
105
148
  req_status = status_with_msg.status
@@ -109,19 +152,68 @@ async def log_streamer(
109
152
  if show_request_waiting_spinner:
110
153
  yield status_msg.stop()
111
154
 
112
- async with aiofiles.open(log_path, 'rb') as f:
113
- async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
114
- follow, cluster_name):
115
- yield chunk
155
+ if log_path is not None and log_path.is_dir():
156
+ # Track which log files we've already streamed
157
+ streamed_files = set()
158
+ no_new_files_count = 0
159
+
160
+ while True:
161
+ # Get all *.log files in the log_path
162
+ log_files = sorted(log_path.glob('*.log'))
163
+
164
+ # Filter out already streamed files
165
+ new_files = [f for f in log_files if f not in streamed_files]
166
+
167
+ if len(new_files) == 0:
168
+ if not follow:
169
+ break
170
+ # Wait a bit to see if new files appear
171
+ await asyncio.sleep(0.5)
172
+ no_new_files_count += 1
173
+ # Check if we've waited too long for new files
174
+ if no_new_files_count > _MAX_WAIT_FOR_NEW_LOG_FILES * 2:
175
+ break
176
+ continue
177
+
178
+ # Reset the no-new-files counter when we find new files
179
+ no_new_files_count = 0
180
+
181
+ for log_file_path in new_files:
182
+ # Add header before each file (similar to tail -f behavior)
183
+ header = f'\n==> {log_file_path} <==\n\n'
184
+ yield header
185
+
186
+ async with aiofiles.open(log_file_path, 'rb') as f:
187
+ async for chunk in _tail_log_file(f, request_id, plain_logs,
188
+ tail, follow,
189
+ cluster_name,
190
+ polling_interval):
191
+ yield chunk
192
+
193
+ # Mark this file as streamed
194
+ streamed_files.add(log_file_path)
195
+
196
+ # If not following, break after streaming all current files
197
+ if not follow:
198
+ break
199
+ else:
200
+ assert log_path is not None, (request_id, log_path)
201
+ async with aiofiles.open(log_path, 'rb') as f:
202
+ async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
203
+ follow, cluster_name,
204
+ polling_interval):
205
+ yield chunk
116
206
 
117
207
 
118
208
  async def _tail_log_file(
119
- f: aiofiles.threadpool.binary.AsyncBufferedReader,
120
- request_id: Optional[str] = None,
121
- plain_logs: bool = False,
122
- tail: Optional[int] = None,
123
- follow: bool = True,
124
- cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
209
+ f: aiofiles.threadpool.binary.AsyncBufferedReader,
210
+ request_id: Optional[str] = None,
211
+ plain_logs: bool = False,
212
+ tail: Optional[int] = None,
213
+ follow: bool = True,
214
+ cluster_name: Optional[str] = None,
215
+ polling_interval: float = DEFAULT_POLL_INTERVAL
216
+ ) -> AsyncGenerator[str, None]:
125
217
  """Tail the opened log file, buffer the lines and flush in chunks."""
126
218
 
127
219
  if tail is not None:
@@ -137,7 +229,7 @@ async def _tail_log_file(
137
229
  yield line_str
138
230
 
139
231
  last_heartbeat_time = asyncio.get_event_loop().time()
140
- last_cluster_status_check_time = asyncio.get_event_loop().time()
232
+ last_status_check_time = asyncio.get_event_loop().time()
141
233
 
142
234
  # Buffer the lines in memory and flush them in chunks to improve log
143
235
  # tailing throughput.
@@ -145,6 +237,9 @@ async def _tail_log_file(
145
237
  buffer_bytes = 0
146
238
  last_flush_time = asyncio.get_event_loop().time()
147
239
 
240
+ # Read file in chunks instead of line-by-line for better performance
241
+ incomplete_line = b'' # Buffer for incomplete lines across chunks
242
+
148
243
  async def flush_buffer() -> AsyncGenerator[str, None]:
149
244
  nonlocal buffer, buffer_bytes, last_flush_time
150
245
  if buffer:
@@ -165,16 +260,41 @@ async def _tail_log_file(
165
260
  async for chunk in flush_buffer():
166
261
  yield chunk
167
262
 
168
- line: Optional[bytes] = await f.readline()
169
- if not line:
170
- if request_id is not None:
263
+ # Read file in chunks for better I/O performance
264
+ file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
265
+ if not file_chunk:
266
+ # Process any remaining incomplete line
267
+ if incomplete_line:
268
+ line_str = incomplete_line.decode('utf-8')
269
+ if plain_logs:
270
+ is_payload, line_str = message_utils.decode_payload(
271
+ line_str, raise_for_mismatch=False)
272
+ if not is_payload:
273
+ buffer.append(line_str)
274
+ buffer_bytes += len(line_str.encode('utf-8'))
275
+ else:
276
+ buffer.append(line_str)
277
+ buffer_bytes += len(line_str.encode('utf-8'))
278
+ incomplete_line = b''
279
+
280
+ # Avoid checking the status too frequently to avoid overloading the
281
+ # DB.
282
+ should_check_status = (current_time -
283
+ last_status_check_time) >= polling_interval
284
+ if not follow:
285
+ # We will only hit this path once, but we should make sure to
286
+ # check the status so that we display the final request status
287
+ # if the request is complete.
288
+ should_check_status = True
289
+ if request_id is not None and should_check_status:
290
+ last_status_check_time = current_time
171
291
  req_status = await requests_lib.get_request_status_async(
172
292
  request_id)
173
293
  if req_status.status > requests_lib.RequestStatus.RUNNING:
174
294
  if (req_status.status ==
175
295
  requests_lib.RequestStatus.CANCELLED):
176
296
  request_task = await requests_lib.get_request_async(
177
- request_id)
297
+ request_id, fields=['name', 'should_retry'])
178
298
  if request_task.should_retry:
179
299
  buffer.append(
180
300
  message_utils.encode_payload(
@@ -183,22 +303,32 @@ async def _tail_log_file(
183
303
  buffer.append(
184
304
  f'{request_task.name!r} request {request_id}'
185
305
  ' cancelled\n')
306
+ del request_task
186
307
  break
187
308
  if not follow:
309
+ # The below checks (cluster status, heartbeat) are not needed
310
+ # for non-follow logs.
188
311
  break
189
312
  # Provision logs pass in cluster_name, check cluster status
190
- # periodically to see if provisioning is done. We only
191
- # check once a second to avoid overloading the DB.
192
- check_status = (current_time - last_cluster_status_check_time
193
- ) >= _CLUSTER_STATUS_INTERVAL
194
- if cluster_name is not None and check_status:
195
- cluster_record = await (
196
- global_user_state.get_status_from_cluster_name_async(
197
- cluster_name))
198
- if (cluster_record is None or
199
- cluster_record != status_lib.ClusterStatus.INIT):
313
+ # periodically to see if provisioning is done.
314
+ if cluster_name is not None:
315
+ if current_time - last_flush_time > _PROVISION_LOG_TIMEOUT:
200
316
  break
201
- last_cluster_status_check_time = current_time
317
+ if should_check_status:
318
+ last_status_check_time = current_time
319
+ cluster_status = await (
320
+ global_user_state.get_status_from_cluster_name_async(
321
+ cluster_name))
322
+ if cluster_status is None:
323
+ logger.debug(
324
+ 'Stop tailing provision logs for cluster'
325
+ f' status for cluster {cluster_name} not found')
326
+ break
327
+ if cluster_status != status_lib.ClusterStatus.INIT:
328
+ logger.debug(
329
+ f'Stop tailing provision logs for cluster'
330
+ f' {cluster_name} has status {cluster_status} '
331
+ '(not in INIT state)')
202
332
  if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
203
333
  # Currently just used to keep the connection busy, refer to
204
334
  # https://github.com/skypilot-org/skypilot/issues/5750 for
@@ -218,38 +348,82 @@ async def _tail_log_file(
218
348
  # performance but it helps avoid unnecessary heartbeat strings
219
349
  # being printed when the client runs in an old version.
220
350
  last_heartbeat_time = asyncio.get_event_loop().time()
221
- line_str = line.decode('utf-8')
222
- if plain_logs:
223
- is_payload, line_str = message_utils.decode_payload(
224
- line_str, raise_for_mismatch=False)
225
- # TODO(aylei): implement heartbeat mechanism for plain logs,
226
- # sending invisible characters might be okay.
227
- if is_payload:
228
- continue
229
- buffer.append(line_str)
230
- buffer_bytes += len(line_str.encode('utf-8'))
351
+
352
+ # Combine with any incomplete line from previous chunk
353
+ file_chunk = incomplete_line + file_chunk
354
+ incomplete_line = b''
355
+
356
+ # Split chunk into lines, preserving line structure
357
+ lines_bytes = file_chunk.split(b'\n')
358
+
359
+ # If chunk doesn't end with newline, the last element is incomplete
360
+ if file_chunk and not file_chunk.endswith(b'\n'):
361
+ incomplete_line = lines_bytes[-1]
362
+ lines_bytes = lines_bytes[:-1]
363
+ else:
364
+ # If ends with \n, split creates an empty last element we should
365
+ # ignore
366
+ if lines_bytes and lines_bytes[-1] == b'':
367
+ lines_bytes = lines_bytes[:-1]
368
+
369
+ # Process all complete lines in this chunk
370
+ for line_bytes in lines_bytes:
371
+ # Reconstruct line with newline (since split removed it)
372
+ line_str = line_bytes.decode('utf-8') + '\n'
373
+
374
+ if plain_logs:
375
+ is_payload, line_str = message_utils.decode_payload(
376
+ line_str, raise_for_mismatch=False)
377
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
378
+ # sending invisible characters might be okay.
379
+ if is_payload:
380
+ continue
381
+
382
+ buffer.append(line_str)
383
+ buffer_bytes += len(line_str.encode('utf-8'))
231
384
 
232
385
  # Flush remaining lines in the buffer.
233
386
  async for chunk in flush_buffer():
234
387
  yield chunk
235
388
 
236
389
 
390
+ def stream_response_for_long_request(
391
+ request_id: str,
392
+ logs_path: pathlib.Path,
393
+ background_tasks: fastapi.BackgroundTasks,
394
+ kill_request_on_disconnect: bool = True,
395
+ ) -> fastapi.responses.StreamingResponse:
396
+ """Stream the logs of a long request."""
397
+ return stream_response(
398
+ request_id,
399
+ logs_path,
400
+ background_tasks,
401
+ polling_interval=LONG_REQUEST_POLL_INTERVAL,
402
+ kill_request_on_disconnect=kill_request_on_disconnect,
403
+ )
404
+
405
+
237
406
  def stream_response(
238
- request_id: str, logs_path: pathlib.Path,
239
- background_tasks: fastapi.BackgroundTasks
407
+ request_id: str,
408
+ logs_path: pathlib.Path,
409
+ background_tasks: fastapi.BackgroundTasks,
410
+ polling_interval: float = DEFAULT_POLL_INTERVAL,
411
+ kill_request_on_disconnect: bool = True,
240
412
  ) -> fastapi.responses.StreamingResponse:
241
413
 
242
- async def on_disconnect():
243
- logger.info(f'User terminated the connection for request '
244
- f'{request_id}')
245
- requests_lib.kill_requests([request_id])
414
+ if kill_request_on_disconnect:
415
+
416
+ async def on_disconnect():
417
+ logger.info(f'User terminated the connection for request '
418
+ f'{request_id}')
419
+ await requests_lib.kill_request_async(request_id)
246
420
 
247
- # The background task will be run after returning a response.
248
- # https://fastapi.tiangolo.com/tutorial/background-tasks/
249
- background_tasks.add_task(on_disconnect)
421
+ # The background task will be run after returning a response.
422
+ # https://fastapi.tiangolo.com/tutorial/background-tasks/
423
+ background_tasks.add_task(on_disconnect)
250
424
 
251
425
  return fastapi.responses.StreamingResponse(
252
- log_streamer(request_id, logs_path),
426
+ log_streamer(request_id, logs_path, polling_interval=polling_interval),
253
427
  media_type='text/plain',
254
428
  headers={
255
429
  'Cache-Control': 'no-cache, no-transform',
sky/server/uvicorn.py CHANGED
@@ -46,11 +46,11 @@ except ValueError:
46
46
 
47
47
  # TODO(aylei): use decorator to register requests that need to be proactively
48
48
  # cancelled instead of hardcoding here.
49
- _RETRIABLE_REQUEST_NAMES = [
49
+ _RETRIABLE_REQUEST_NAMES = {
50
50
  'sky.logs',
51
51
  'sky.jobs.logs',
52
52
  'sky.serve.logs',
53
- ]
53
+ }
54
54
 
55
55
 
56
56
  def add_timestamp_prefix_for_server_logs() -> None:
@@ -151,37 +151,38 @@ class Server(uvicorn.Server):
151
151
  requests_lib.RequestStatus.PENDING,
152
152
  requests_lib.RequestStatus.RUNNING,
153
153
  ]
154
- reqs = requests_lib.get_request_tasks(
155
- req_filter=requests_lib.RequestTaskFilter(status=statuses))
156
- if not reqs:
154
+ requests = [(request_task.request_id, request_task.name)
155
+ for request_task in requests_lib.get_request_tasks(
156
+ req_filter=requests_lib.RequestTaskFilter(
157
+ status=statuses, fields=['request_id', 'name']))
158
+ ]
159
+ if not requests:
157
160
  break
158
- logger.info(f'{len(reqs)} on-going requests '
161
+ logger.info(f'{len(requests)} on-going requests '
159
162
  'found, waiting for them to finish...')
160
163
  # Proactively cancel internal requests and logs requests since
161
164
  # they can run for infinite time.
162
- internal_request_ids = [
165
+ internal_request_ids = {
163
166
  d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
164
- ]
167
+ }
165
168
  if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
166
169
  logger.warning('Timeout waiting for on-going requests to '
167
170
  'finish, cancelling all on-going requests.')
168
- for req in reqs:
169
- self.interrupt_request_for_retry(req.request_id)
171
+ for request_id, _ in requests:
172
+ self.interrupt_request_for_retry(request_id)
170
173
  break
171
174
  interrupted = 0
172
- for req in reqs:
173
- if req.request_id in internal_request_ids:
174
- self.interrupt_request_for_retry(req.request_id)
175
- interrupted += 1
176
- elif req.name in _RETRIABLE_REQUEST_NAMES:
177
- self.interrupt_request_for_retry(req.request_id)
175
+ for request_id, name in requests:
176
+ if (name in _RETRIABLE_REQUEST_NAMES or
177
+ request_id in internal_request_ids):
178
+ self.interrupt_request_for_retry(request_id)
178
179
  interrupted += 1
179
180
  # TODO(aylei): interrupt pending requests to accelerate the
180
181
  # shutdown.
181
182
  # If some requests are not interrupted, wait for them to finish,
182
183
  # otherwise we just check again immediately to accelerate the
183
184
  # shutdown process.
184
- if interrupted < len(reqs):
185
+ if interrupted < len(requests):
185
186
  time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
186
187
 
187
188
  def interrupt_request_for_retry(self, request_id: str) -> None:
@@ -98,6 +98,10 @@ version_table = alembic_version_spot_jobs_db
98
98
  version_locations = %(here)s/../schemas/db/serve_state
99
99
  version_table = alembic_version_serve_state_db
100
100
 
101
+ [sky_config_db]
102
+ version_locations = %(here)s/../schemas/db/skypilot_config
103
+ version_table = alembic_version_sky_config_db
104
+
101
105
  [post_write_hooks]
102
106
  # post_write_hooks defines scripts or Python functions that are run
103
107
  # on newly generated revision scripts. See the documentation for further
@@ -49,6 +49,7 @@ install_requires = [
49
49
  # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
50
50
  'pyyaml > 3.13, != 5.4.*',
51
51
  'ijson',
52
+ 'orjson',
52
53
  'requests',
53
54
  # SkyPilot inherits from uvicorn.Server to customize the behavior of
54
55
  # uvicorn, so we need to pin uvicorn version to avoid potential break
@@ -86,7 +87,6 @@ install_requires = [
86
87
  'types-paramiko',
87
88
  'alembic',
88
89
  'aiohttp',
89
- 'aiosqlite',
90
90
  'anyio',
91
91
  ]
92
92
 
@@ -104,6 +104,10 @@ GRPC = 'grpcio>=1.63.0'
104
104
  PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
105
105
 
106
106
  server_dependencies = [
107
+ # TODO: Some of these dependencies are also specified in install_requires,
108
+ # so they are redundant here. We should figure out if they are only needed
109
+ # on the server (should remove from install_requires), or if they are needed
110
+ # on the client (should remove from here).
107
111
  'casbin',
108
112
  'sqlalchemy_adapter',
109
113
  'passlib',
@@ -144,11 +148,19 @@ aws_dependencies = [
144
148
  'colorama < 0.4.5',
145
149
  ]
146
150
 
151
+ # Kubernetes 32.0.0 has an authentication bug:
152
+ # https://github.com/kubernetes-client/python/issues/2333
153
+ kubernetes_dependencies = [
154
+ 'kubernetes>=20.0.0,!=32.0.0',
155
+ 'websockets',
156
+ 'python-dateutil',
157
+ ]
158
+
147
159
  # azure-cli cannot be installed normally by uv, so we need to work around it in
148
160
  # a few places.
149
161
  AZURE_CLI = 'azure-cli>=2.65.0'
150
162
 
151
- extras_require: Dict[str, List[str]] = {
163
+ cloud_dependencies: Dict[str, List[str]] = {
152
164
  'aws': aws_dependencies,
153
165
  # TODO(zongheng): azure-cli is huge and takes a long time to install.
154
166
  # Tracked in: https://github.com/Azure/azure-cli/issues/7387
@@ -184,14 +196,11 @@ extras_require: Dict[str, List[str]] = {
184
196
  'docker': ['docker'] + local_ray,
185
197
  'lambda': [], # No dependencies needed for lambda
186
198
  'cloudflare': aws_dependencies,
199
+ 'coreweave': aws_dependencies + kubernetes_dependencies,
187
200
  'scp': local_ray,
188
201
  'oci': ['oci'],
189
- # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
190
- 'kubernetes': [
191
- 'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
192
- ],
193
- 'ssh': ['kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'],
194
- 'remote': remote,
202
+ 'kubernetes': kubernetes_dependencies,
203
+ 'ssh': kubernetes_dependencies,
195
204
  # For the container registry auth api. Reference:
196
205
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
197
206
  # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
@@ -221,12 +230,11 @@ extras_require: Dict[str, List[str]] = {
221
230
  ] + aws_dependencies,
222
231
  'hyperbolic': [], # No dependencies needed for hyperbolic
223
232
  'seeweb': ['ecsapi>=0.2.0'],
224
- 'server': server_dependencies,
233
+ 'shadeform': [], # No dependencies needed for shadeform
225
234
  }
226
235
 
227
236
  # Calculate which clouds should be included in the [all] installation.
228
- clouds_for_all = set(extras_require)
229
- clouds_for_all.remove('remote')
237
+ clouds_for_all = set(cloud_dependencies)
230
238
 
231
239
  if sys.version_info < (3, 10):
232
240
  # Nebius needs python3.10. If python 3.9 [all] will not install nebius
@@ -241,5 +249,16 @@ if sys.version_info >= (3, 12):
241
249
  # TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
242
250
  clouds_for_all.remove('vast')
243
251
 
244
- extras_require['all'] = list(
245
- set().union(*[extras_require[cloud] for cloud in clouds_for_all]))
252
+ cloud_extras = {
253
+ cloud: dependencies + server_dependencies
254
+ for cloud, dependencies in cloud_dependencies.items()
255
+ }
256
+
257
+ extras_require: Dict[str, List[str]] = {
258
+ # Include server_dependencies with each cloud.
259
+ **cloud_extras,
260
+ 'all': list(set().union(*[cloud_extras[cloud] for cloud in clouds_for_all])
261
+ ),
262
+ 'remote': remote,
263
+ 'server': server_dependencies,
264
+ }
sky/sky_logging.py CHANGED
@@ -109,7 +109,6 @@ def _setup_logger():
109
109
  global _default_handler
110
110
  if _default_handler is None:
111
111
  _default_handler = EnvAwareHandler(sys.stdout)
112
- _default_handler.flush = sys.stdout.flush # type: ignore
113
112
  if env_options.Options.SHOW_DEBUG_INFO.get():
114
113
  _default_handler.setLevel(logging.DEBUG)
115
114
  else:
@@ -129,7 +128,6 @@ def _setup_logger():
129
128
  for logger_name in _SENSITIVE_LOGGER:
130
129
  logger = logging.getLogger(logger_name)
131
130
  handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
132
- handler_to_logger.flush = sys.stdout.flush # type: ignore
133
131
  logger.addHandler(handler_to_logger)
134
132
  logger.setLevel(logging.INFO)
135
133
  if _show_logging_prefix():