skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (114)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/backends/backend_utils.py +9 -6
  5. sky/backends/cloud_vm_ray_backend.py +2 -3
  6. sky/check.py +25 -13
  7. sky/client/cli/command.py +52 -24
  8. sky/cloud_stores.py +73 -0
  9. sky/clouds/aws.py +59 -11
  10. sky/core.py +7 -5
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  15. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
  28. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/data/data_utils.py +92 -1
  45. sky/data/mounting_utils.py +71 -2
  46. sky/data/storage.py +166 -9
  47. sky/global_user_state.py +14 -18
  48. sky/jobs/constants.py +2 -0
  49. sky/jobs/controller.py +62 -67
  50. sky/jobs/file_content_utils.py +80 -0
  51. sky/jobs/log_gc.py +201 -0
  52. sky/jobs/scheduler.py +15 -2
  53. sky/jobs/server/core.py +85 -13
  54. sky/jobs/server/server.py +14 -13
  55. sky/jobs/server/utils.py +28 -10
  56. sky/jobs/state.py +216 -40
  57. sky/jobs/utils.py +65 -28
  58. sky/metrics/utils.py +18 -0
  59. sky/optimizer.py +1 -1
  60. sky/provision/kubernetes/instance.py +88 -19
  61. sky/provision/kubernetes/volume.py +2 -2
  62. sky/schemas/api/responses.py +3 -5
  63. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  64. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  65. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  66. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  67. sky/serve/replica_managers.py +2 -2
  68. sky/serve/serve_utils.py +9 -2
  69. sky/serve/server/server.py +8 -7
  70. sky/server/common.py +21 -15
  71. sky/server/constants.py +1 -1
  72. sky/server/daemons.py +23 -17
  73. sky/server/requests/executor.py +7 -3
  74. sky/server/requests/payloads.py +2 -0
  75. sky/server/requests/request_names.py +80 -0
  76. sky/server/requests/requests.py +137 -102
  77. sky/server/requests/serializers/decoders.py +0 -6
  78. sky/server/requests/serializers/encoders.py +33 -6
  79. sky/server/server.py +105 -36
  80. sky/server/stream_utils.py +56 -13
  81. sky/setup_files/dependencies.py +2 -0
  82. sky/skylet/constants.py +6 -1
  83. sky/skylet/events.py +7 -0
  84. sky/skylet/services.py +18 -7
  85. sky/ssh_node_pools/server.py +5 -4
  86. sky/task.py +14 -42
  87. sky/templates/kubernetes-ray.yml.j2 +1 -1
  88. sky/templates/nebius-ray.yml.j2 +1 -0
  89. sky/templates/websocket_proxy.py +140 -12
  90. sky/users/permission.py +4 -1
  91. sky/utils/cli_utils/status_utils.py +8 -2
  92. sky/utils/context_utils.py +13 -1
  93. sky/utils/db/migration_utils.py +1 -1
  94. sky/utils/resource_checker.py +4 -1
  95. sky/utils/resources_utils.py +53 -29
  96. sky/utils/schemas.py +23 -4
  97. sky/volumes/server/server.py +4 -3
  98. sky/workspaces/server.py +7 -6
  99. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
  100. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
  101. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  102. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  108. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  109. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  110. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  111. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  112. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  113. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  114. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/ssh_node_pools/server.py CHANGED
@@ -7,6 +7,7 @@ import fastapi
  from sky import core as sky_core
  from sky.server.requests import executor
  from sky.server.requests import payloads
+ from sky.server.requests import request_names
  from sky.server.requests import requests as requests_lib
  from sky.ssh_node_pools import core as ssh_node_pools_core
  from sky.utils import common_utils
@@ -101,7 +102,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='ssh_up',
+ request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
  request_body=ssh_up_body,
  func=sky_core.ssh_up,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -126,7 +127,7 @@ async def deploy_ssh_node_pool_general(
  try:
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='ssh_up',
+ request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
  request_body=ssh_up_body,
  func=sky_core.ssh_up,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -152,7 +153,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='ssh_down',
+ request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
  request_body=ssh_up_body,
  func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -180,7 +181,7 @@ async def down_ssh_node_pool_general(
  ssh_up_body.cleanup = True
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='ssh_down',
+ request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
  request_body=ssh_up_body,
  func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
  schedule_type=requests_lib.ScheduleType.LONG,
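Note: the hunks above reference sky/server/requests/request_names.py, a module added in this release (+80 lines) that is not shown in this section. A minimal sketch of the pattern these call sites rely on — a string-valued enum replacing the previous 'ssh_up'/'ssh_down' literals — might look like the following; only the two members used here are shown, and the string values are placeholders, not the actual module contents:

```python
# Hypothetical sketch of sky/server/requests/request_names.py; the member
# values below are assumptions for illustration only.
import enum


class RequestName(str, enum.Enum):
    """Canonical request names, replacing ad-hoc string literals."""
    SSH_NODE_POOLS_UP = 'ssh_node_pools.up'
    SSH_NODE_POOLS_DOWN = 'ssh_node_pools.down'
```

Because the enum mixes in str, each member compares equal to its plain-string value, so call sites such as request_name=request_names.RequestName.SSH_NODE_POOLS_UP stay compatible with code that still treats request names as strings.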
sky/task.py CHANGED
@@ -1,6 +1,5 @@
  """Task: a coarse-grained stage in an application."""
  import collections
- import inspect
  import json
  import os
  import re
@@ -29,10 +28,6 @@ from sky.utils import yaml_utils

  logger = sky_logging.init_logger(__name__)

- # A lambda generating commands (node rank_i, node addrs -> cmd_i).
- CommandGen = Callable[[int, List[str]], Optional[str]]
- CommandOrCommandGen = Union[str, CommandGen]
-
  _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
  _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
  ' uppercase letters, digits, underscores, periods,'
@@ -236,7 +231,7 @@ class Task:
  name: Optional[str] = None,
  *,
  setup: Optional[Union[str, List[str]]] = None,
- run: Optional[Union[CommandOrCommandGen, List[str]]] = None,
+ run: Optional[Union[str, List[str]]] = None,
  envs: Optional[Dict[str, str]] = None,
  secrets: Optional[Dict[str, str]] = None,
  workdir: Optional[Union[str, Dict[str, Any]]] = None,
@@ -349,7 +344,7 @@ class Task:
  self._volumes = volumes or {}

  # concatenate commands if given as list
- def _concat(commands):
+ def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
  if isinstance(commands, list):
  return '\n'.join(commands)
  return commands
@@ -447,42 +442,9 @@ class Task:

  def validate_run(self):
  """Validates if the run command is valid."""
- if callable(self.run):
- run_sig = inspect.signature(self.run)
- # Check that run is a function with 2 arguments.
- if len(run_sig.parameters) != 2:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
- type_list = [int, List[str]]
- # Check annotations, if exists
- for i, param in enumerate(run_sig.parameters.values()):
- if param.annotation != inspect.Parameter.empty:
- if param.annotation != type_list[i]:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- _RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
- # Check self containedness.
- run_closure = inspect.getclosurevars(self.run)
- if run_closure.nonlocals:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- 'run command generator must be self contained. '
- f'Found nonlocals: {run_closure.nonlocals}')
- if run_closure.globals:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- 'run command generator must be self contained. '
- f'Found globals: {run_closure.globals}')
- if run_closure.unbound:
- # Do not raise an error here. Import statements, which are
- # allowed, will be considered as unbounded.
- pass
- elif self.run is not None and not isinstance(self.run, str):
+ if self.run is not None and not isinstance(self.run, str):
  with ux_utils.print_exception_no_traceback():
- raise ValueError('run must be either a shell script (str) or '
- f'a command generator ({CommandGen}). '
+ raise ValueError('run must be a shell script (str). '
  f'Got {type(self.run)}')

  def expand_and_validate_file_mounts(self):
@@ -1552,6 +1514,16 @@ class Task:
  self.update_file_mounts({
  mnt_path: blob_path,
  })
+ elif store_type is storage_lib.StoreType.COREWEAVE:
+ if storage.source is not None and not isinstance(
+ storage.source,
+ list) and storage.source.startswith('cw://'):
+ blob_path = storage.source
+ else:
+ blob_path = 'cw://' + storage.name
+ self.update_file_mounts({
+ mnt_path: blob_path,
+ })
  else:
  with ux_utils.print_exception_no_traceback():
  raise ValueError(f'Storage Type {store_type} '
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -1059,7 +1059,7 @@ available_node_types:
  # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
  # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
  # will delete the service from the database after it is terminated so everything in the database is running.
- ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.get_managed_jobs(); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
+ ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
  if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
  read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
  fi
sky/templates/nebius-ray.yml.j2 CHANGED
@@ -156,6 +156,7 @@ setup_commands:
  echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
  {%- endfor %}
  {%- endif %}
+ IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
sky/templates/websocket_proxy.py CHANGED
@@ -11,15 +11,23 @@ This script is useful for users who do not have local Kubernetes credentials.
  import asyncio
  from http.cookiejar import MozillaCookieJar
  import os
+ import struct
  import sys
- from typing import Dict
+ import time
+ from typing import Dict, Optional
  from urllib.request import Request

+ import requests
  import websockets
  from websockets.asyncio.client import ClientConnection
  from websockets.asyncio.client import connect

+ from sky.server import constants
+ from sky.server.server import KubernetesSSHMessageType
+ from sky.skylet import constants as skylet_constants
+
  BUFFER_SIZE = 2**16 # 64KB
+ HEARTBEAT_INTERVAL_SECONDS = 10

  # Environment variable for a file path to the API cookie file.
  # Keep in sync with server/constants.py
@@ -28,6 +36,8 @@ API_COOKIE_FILE_ENV_VAR = 'SKYPILOT_API_COOKIE_FILE'
  # Keep in sync with server/constants.py
  API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'

+ MAX_UNANSWERED_PINGS = 100
+

  def _get_cookie_header(url: str) -> Dict[str, str]:
  """Extract Cookie header value from a cookie jar for a specific URL"""
@@ -49,7 +59,7 @@ def _get_cookie_header(url: str) -> Dict[str, str]:
  return {'Cookie': cookie_header}


- async def main(url: str) -> None:
+ async def main(url: str, timestamps_supported: bool) -> None:
  cookie_header = _get_cookie_header(url)
  async with connect(url,
  ping_interval=None,
@@ -75,45 +85,149 @@ async def main(url: str) -> None:
  asyncio.streams.FlowControlMixin, sys.stdout) # type: ignore
  stdout_writer = asyncio.StreamWriter(transport, protocol, None,
  loop)
+ # Dictionary to store last ping time for latency measurement
+ last_ping_time_dict: Optional[Dict[int, float]] = None
+ if timestamps_supported:
+ last_ping_time_dict = {}
+
+ # Use an Event to signal when websocket is closed
+ websocket_closed_event = asyncio.Event()
+ websocket_lock = asyncio.Lock()

- await asyncio.gather(stdin_to_websocket(stdin_reader, websocket),
- websocket_to_stdout(websocket, stdout_writer))
+ await asyncio.gather(
+ stdin_to_websocket(stdin_reader, websocket,
+ timestamps_supported, websocket_closed_event,
+ websocket_lock),
+ websocket_to_stdout(websocket, stdout_writer,
+ timestamps_supported, last_ping_time_dict,
+ websocket_closed_event, websocket_lock),
+ latency_monitor(websocket, last_ping_time_dict,
+ websocket_closed_event, websocket_lock),
+ return_exceptions=True)
  finally:
  if old_settings:
  termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
  old_settings)


+ async def latency_monitor(websocket: ClientConnection,
+ last_ping_time_dict: Optional[dict],
+ websocket_closed_event: asyncio.Event,
+ websocket_lock: asyncio.Lock):
+ """Periodically send PING messages (type 1) to measure latency."""
+ if last_ping_time_dict is None:
+ return
+ next_id = 0
+ while not websocket_closed_event.is_set():
+ try:
+ await asyncio.sleep(HEARTBEAT_INTERVAL_SECONDS)
+ if len(last_ping_time_dict) >= MAX_UNANSWERED_PINGS:
+ # We are not getting responses, clear the dictionary so
+ # as not to grow unbounded.
+ last_ping_time_dict.clear()
+ ping_time = time.time()
+ next_id += 1
+ last_ping_time_dict[next_id] = ping_time
+ message_header_bytes = struct.pack(
+ '!BI', KubernetesSSHMessageType.PINGPONG.value, next_id)
+ try:
+ async with websocket_lock:
+ await websocket.send(message_header_bytes)
+ except websockets.exceptions.ConnectionClosed as e:
+ # Websocket is already closed.
+ print(f'Failed to send PING message: {e}', file=sys.stderr)
+ break
+ except Exception as e:
+ print(f'Error in latency_monitor: {e}', file=sys.stderr)
+ websocket_closed_event.set()
+ raise e
+
+
  async def stdin_to_websocket(reader: asyncio.StreamReader,
- websocket: ClientConnection):
+ websocket: ClientConnection,
+ timestamps_supported: bool,
+ websocket_closed_event: asyncio.Event,
+ websocket_lock: asyncio.Lock):
  try:
- while True:
+ while not websocket_closed_event.is_set():
  # Read at most BUFFER_SIZE bytes, this not affect
  # responsiveness since it will return as soon as
  # there is at least one byte.
  # The BUFFER_SIZE is chosen to be large enough to improve
  # throughput.
  data = await reader.read(BUFFER_SIZE)
+
  if not data:
  break
- await websocket.send(data)
+ if timestamps_supported:
+ # Send message with type 0 to indicate data.
+ message_type_bytes = struct.pack(
+ '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+ data = message_type_bytes + data
+ async with websocket_lock:
+ await websocket.send(data)
+
  except Exception as e: # pylint: disable=broad-except
  print(f'Error in stdin_to_websocket: {e}', file=sys.stderr)
  finally:
- await websocket.close()
+ async with websocket_lock:
+ await websocket.close()
+ websocket_closed_event.set()


  async def websocket_to_stdout(websocket: ClientConnection,
- writer: asyncio.StreamWriter):
+ writer: asyncio.StreamWriter,
+ timestamps_supported: bool,
+ last_ping_time_dict: Optional[dict],
+ websocket_closed_event: asyncio.Event,
+ websocket_lock: asyncio.Lock):
  try:
- while True:
+ while not websocket_closed_event.is_set():
  message = await websocket.recv()
+ if (timestamps_supported and len(message) > 0 and
+ last_ping_time_dict is not None):
+ message_type = struct.unpack('!B', message[:1])[0]
+ if message_type == KubernetesSSHMessageType.REGULAR_DATA.value:
+ # Regular data - strip type byte and write to stdout
+ message = message[1:]
+ elif message_type == KubernetesSSHMessageType.PINGPONG.value:
+ # PONG response - calculate latency and send measurement
+ if not len(message) == struct.calcsize('!BI'):
+ raise ValueError(
+ f'Invalid PONG message length: {len(message)}')
+ pong_id = struct.unpack('!I', message[1:5])[0]
+ pong_time = time.time()
+
+ ping_time = last_ping_time_dict.pop(pong_id, None)
+
+ if ping_time is None:
+ continue
+
+ latency_seconds = pong_time - ping_time
+ latency_ms = int(latency_seconds * 1000)
+
+ # Send latency measurement (type 2)
+ message_type_bytes = struct.pack(
+ '!B',
+ KubernetesSSHMessageType.LATENCY_MEASUREMENT.value)
+ latency_bytes = struct.pack('!Q', latency_ms)
+ message = message_type_bytes + latency_bytes
+ # Send to server.
+ async with websocket_lock:
+ await websocket.send(message)
+ continue
+ # No timestamps support, write directly
  writer.write(message)
  await writer.drain()
  except websockets.exceptions.ConnectionClosed:
  print('WebSocket connection closed', file=sys.stderr)
  except Exception as e: # pylint: disable=broad-except
  print(f'Error in websocket_to_stdout: {e}', file=sys.stderr)
+ raise e
+ finally:
+ async with websocket_lock:
+ await websocket.close()
+ websocket_closed_event.set()


  if __name__ == '__main__':
@@ -123,11 +237,25 @@ if __name__ == '__main__':
  # TODO(aylei): Remove this after 0.10.0
  server_url = f'http://{server_url}'

+ health_url = f'{server_url}/api/health'
+ health_response = requests.get(health_url)
+ health_data = health_response.json()
+ timestamps_are_supported = int(health_data['api_version']) > 21
+ disable_latency_measurement = os.environ.get(
+ skylet_constants.SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR, '0') == '1'
+ timestamps_are_supported = (timestamps_are_supported and
+ not disable_latency_measurement)
+
  server_proto, server_fqdn = server_url.split('://')
  websocket_proto = 'ws'
  if server_proto == 'https':
  websocket_proto = 'wss'
  server_url = f'{websocket_proto}://{server_fqdn}'
+
+ client_version_str = (f'&client_version={constants.API_VERSION}'
+ if timestamps_are_supported else '')
+
  websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
- f'?cluster_name={sys.argv[2]}')
- asyncio.run(main(websocket_url))
+ f'?cluster_name={sys.argv[2]}'
+ f'{client_version_str}')
+ asyncio.run(main(websocket_url, timestamps_are_supported))
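Note: the proxy above frames every websocket message as a 1-byte message type followed by a type-specific payload. The type values in the sketch below (0/1/2) are inferred from the comments in the diff ("type 0 to indicate data", "PING messages (type 1)", "latency measurement (type 2)"); the real KubernetesSSHMessageType enum lives in sky/server/server.py, so treat this as an illustration of the framing rather than the actual definition:

```python
# Illustrative sketch of the SSH-proxy message framing; the enum definition and
# its numeric values are assumptions inferred from the diff comments above.
import enum
import struct


class KubernetesSSHMessageType(enum.Enum):
    REGULAR_DATA = 0         # raw SSH bytes, prefixed with the type byte
    PINGPONG = 1             # latency probe: 1-byte type + 4-byte unsigned id
    LATENCY_MEASUREMENT = 2  # 1-byte type + 8-byte unsigned latency in ms


# PING frame sent by latency_monitor(): '!BI' = type byte + ping id.
ping_frame = struct.pack('!BI', KubernetesSSHMessageType.PINGPONG.value, 42)

# On the PONG echo, websocket_to_stdout() unpacks the id and reports the
# measured latency back to the server as a '!B' type byte plus a '!Q' count
# of milliseconds.
_, ping_id = struct.unpack('!BI', ping_frame)
latency_frame = (struct.pack('!B',
                             KubernetesSSHMessageType.LATENCY_MEASUREMENT.value) +
                 struct.pack('!Q', 17))
assert ping_id == 42 and len(latency_frame) == 9
```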
sky/users/permission.py CHANGED
@@ -43,7 +43,6 @@ class PermissionService:
  with _policy_lock():
  global _enforcer_instance
  if _enforcer_instance is None:
- _enforcer_instance = self
  engine = global_user_state.initialize_and_get_db()
  db_utils.add_all_tables_to_db_sqlalchemy(
  sqlalchemy_adapter.Base.metadata, engine)
@@ -53,6 +52,10 @@ class PermissionService:
  'model.conf')
  enforcer = casbin.Enforcer(model_path, adapter)
  self.enforcer = enforcer
+ # Only set the enforcer instance once the enforcer
+ # is successfully initialized, if we change it and then fail
+ # we will set it to None and all subsequent calls will fail.
+ _enforcer_instance = self
  self._maybe_initialize_policies()
  self._maybe_initialize_basic_auth_user()
  else:
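Note: the permission.py change moves the assignment of the module-level singleton so it only happens after the enforcer is fully constructed, preventing a failed first initialization from leaving a broken instance behind. The pattern in isolation, as a standalone sketch under assumed names (not the actual SkyPilot classes):

```python
# Standalone sketch of the "publish the singleton only after successful
# construction" pattern; Service and get_service are hypothetical names.
import threading
from typing import Optional


class Service:
    def __init__(self) -> None:
        self.ready = self._expensive_init()

    def _expensive_init(self) -> bool:
        # May raise in real code (e.g. DB or policy-file errors).
        return True


_instance: Optional[Service] = None
_lock = threading.Lock()


def get_service() -> Service:
    global _instance
    with _lock:
        if _instance is None:
            service = Service()   # may raise; _instance stays None on failure
            _instance = service   # publish only after init fully succeeds
        return _instance
```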
sky/utils/cli_utils/status_utils.py CHANGED
@@ -282,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
  if resources_str_full is not None:
  resources_str = resources_str_full
  if resources_str is None:
- resources_str = resources_utils.get_readable_resources_repr(
- handle, simplify=truncate)
+ resources_str_simple, resources_str_full = (
+ resources_utils.get_readable_resources_repr(
+ handle, simplified_only=truncate))
+ if truncate:
+ resources_str = resources_str_simple
+ else:
+ assert resources_str_full is not None
+ resources_str = resources_str_full

  return resources_str
  return '-'
sky/utils/context_utils.py CHANGED
@@ -8,6 +8,7 @@ import multiprocessing
  import os
  import subprocess
  import sys
+ import time
  import typing
  from typing import Any, Callable, IO, Optional, Tuple, TypeVar

@@ -18,6 +19,7 @@ from sky.utils import context
  from sky.utils import subprocess_utils

  StreamHandler = Callable[[IO[Any], IO[Any]], str]
+ PASSTHROUGH_FLUSH_INTERVAL_SECONDS = 0.5

  logger = sky_logging.init_logger(__name__)

@@ -46,6 +48,7 @@ def hijack_sys_attrs():

  def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
  """Passthrough the stream from the process to the output stream"""
+ last_flush_time = time.time()
  wrapped = io.TextIOWrapper(in_stream,
  encoding='utf-8',
  newline='',
@@ -55,9 +58,18 @@ def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
  line = wrapped.readline()
  if line:
  out_stream.write(line)
- out_stream.flush()
+
+ # Flush based on timeout instead of on every line
+ current_time = time.time()
+ if (current_time - last_flush_time >=
+ PASSTHROUGH_FLUSH_INTERVAL_SECONDS):
+ out_stream.flush()
+ last_flush_time = current_time
  else:
  break
+
+ # Final flush to ensure all data is written
+ out_stream.flush()
  return ''

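Note: the flush-batching change above trades at most PASSTHROUGH_FLUSH_INTERVAL_SECONDS of output latency for far fewer flush calls on chatty streams. The same pattern in isolation, as a runnable sketch independent of SkyPilot's stream types:

```python
# Standalone sketch of interval-based flushing (not SkyPilot code): flush at
# most once per FLUSH_INTERVAL_SECONDS while copying lines, then flush once
# more at EOF so nothing is left buffered.
import io
import time

FLUSH_INTERVAL_SECONDS = 0.5


def copy_with_throttled_flush(src: io.TextIOBase, dst: io.TextIOBase) -> None:
    last_flush = time.time()
    for line in src:
        dst.write(line)
        now = time.time()
        if now - last_flush >= FLUSH_INTERVAL_SECONDS:
            dst.flush()
            last_flush = now
    dst.flush()  # final flush


if __name__ == '__main__':
    copy_with_throttled_flush(io.StringIO('a\nb\nc\n'), io.StringIO())
```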
sky/utils/db/migration_utils.py CHANGED
@@ -22,7 +22,7 @@ GLOBAL_USER_STATE_VERSION = '010'
  GLOBAL_USER_STATE_LOCK_PATH = f'~/.sky/locks/.{GLOBAL_USER_STATE_DB_NAME}.lock'

  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
- SPOT_JOBS_VERSION = '003'
+ SPOT_JOBS_VERSION = '005'
  SPOT_JOBS_LOCK_PATH = f'~/.sky/locks/.{SPOT_JOBS_DB_NAME}.lock'

  SERVE_DB_NAME = 'serve_db'
sky/utils/resource_checker.py CHANGED
@@ -278,7 +278,10 @@ def _get_active_resources(
  from sky.jobs.server import core as managed_jobs_core
  try:
  filtered_jobs, _, _, _ = managed_jobs_core.queue_v2(
- refresh=False, skip_finished=True, all_users=True)
+ refresh=False,
+ skip_finished=True,
+ all_users=True,
+ fields=['job_id', 'user_hash', 'workspace'])
  return filtered_jobs
  except exceptions.ClusterNotUpError:
  logger.warning('All jobs should be finished.')
sky/utils/resources_utils.py CHANGED
@@ -181,57 +181,81 @@ def simplify_ports(ports: List[str]) -> List[str]:


  def format_resource(resource: 'resources_lib.Resources',
- simplify: bool = False) -> str:
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
  resource = resource.assert_launchable()
- vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
- resource.instance_type)
+ is_k8s = str(resource.cloud).lower() == 'kubernetes'
+ if resource.accelerators is None or is_k8s or not simplified_only:
+ vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
+ resource.instance_type)

- components = []
+ elements_simple = []
+ elements_full = []

  if resource.accelerators is not None:
  acc, count = list(resource.accelerators.items())[0]
- components.append(f'gpus={acc}:{count}')
+ elements_simple.append(f'gpus={acc}:{count}')
+ elements_full.append(f'gpus={acc}:{count}')

- is_k8s = str(resource.cloud).lower() == 'kubernetes'
- if (resource.accelerators is None or is_k8s or not simplify):
+ if (resource.accelerators is None or is_k8s):
+ if vcpu is not None:
+ elements_simple.append(f'cpus={int(vcpu)}')
+ elements_full.append(f'cpus={int(vcpu)}')
+ if mem is not None:
+ elements_simple.append(f'mem={int(mem)}')
+ elements_full.append(f'mem={int(mem)}')
+ elif not simplified_only:
  if vcpu is not None:
- components.append(f'cpus={int(vcpu)}')
+ elements_full.append(f'cpus={int(vcpu)}')
  if mem is not None:
- components.append(f'mem={int(mem)}')
+ elements_full.append(f'mem={int(mem)}')

- instance_type = resource.instance_type
- if simplify:
- instance_type = common_utils.truncate_long_string(instance_type, 15)
  if not is_k8s:
- components.append(instance_type)
- if simplify:
- components.append('...')
- else:
+ instance_type_full = resource.instance_type
+ instance_type_simple = common_utils.truncate_long_string(
+ instance_type_full, 15)
+ elements_simple.append(instance_type_simple)
+ elements_full.append(instance_type_full)
+ elements_simple.append('...')
+ if not simplified_only:
  image_id = resource.image_id
  if image_id is not None:
  if None in image_id:
- components.append(f'image_id={image_id[None]}')
+ elements_full.append(f'image_id={image_id[None]}')
  else:
- components.append(f'image_id={image_id}')
- components.append(f'disk={resource.disk_size}')
+ elements_full.append(f'image_id={image_id}')
+ elements_full.append(f'disk={resource.disk_size}')
  disk_tier = resource.disk_tier
  if disk_tier is not None:
- components.append(f'disk_tier={disk_tier.value}')
+ elements_full.append(f'disk_tier={disk_tier.value}')
  ports = resource.ports
  if ports is not None:
- components.append(f'ports={ports}')
+ elements_full.append(f'ports={ports}')

  spot = '[spot]' if resource.use_spot else ''
- return f'{spot}({"" if not components else ", ".join(components)})'
-
-
- def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
- simplify: bool = False) -> str:
+ resources_str_simple = (
+ f'{spot}({"" if not elements_simple else ", ".join(elements_simple)})')
+ if simplified_only:
+ return resources_str_simple, None
+ else:
+ resources_str_full = (
+ f'{spot}({"" if not elements_full else ", ".join(elements_full)})')
+ return resources_str_simple, resources_str_full
+
+
+ def get_readable_resources_repr(
+ handle: 'backends.CloudVmRayResourceHandle',
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
+ resource_str_simple, resource_str_full = format_resource(
+ handle.launched_resources, simplified_only)
+ if not simplified_only:
+ assert resource_str_full is not None
  if (handle.launched_nodes is not None and
  handle.launched_resources is not None):
- return (f'{handle.launched_nodes}x'
- f'{format_resource(handle.launched_resources, simplify)}')
- return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
+ return (f'{handle.launched_nodes}x{resource_str_simple}',
+ None if simplified_only else
+ f'{handle.launched_nodes}x{resource_str_full}')
+ return (_DEFAULT_MESSAGE_HANDLE_INITIALIZING,
+ _DEFAULT_MESSAGE_HANDLE_INITIALIZING)


  def make_ray_custom_resources_str(
sky/utils/schemas.py CHANGED
@@ -1190,7 +1190,13 @@ def get_config_schema():
  'consolidation_mode': {
  'type': 'boolean',
  'default': False,
- }
+ },
+ 'controller_logs_gc_retention_hours': {
+ 'type': 'integer',
+ },
+ 'task_logs_gc_retention_hours': {
+ 'type': 'integer',
+ },
  },
  },
  'bucket': {
@@ -1592,10 +1598,10 @@ def get_config_schema():

  allowed_workspace_cloud_names = list(constants.ALL_CLOUDS) + ['cloudflare']
  # Create pattern for not supported clouds, i.e.
- # all clouds except gcp, kubernetes, ssh
+ # all clouds except aws, gcp, kubernetes, ssh, nebius
  not_supported_clouds = [
  cloud for cloud in allowed_workspace_cloud_names
- if cloud.lower() not in ['gcp', 'kubernetes', 'ssh', 'nebius']
+ if cloud.lower() not in ['aws', 'gcp', 'kubernetes', 'ssh', 'nebius']
  ]
  not_supported_cloud_regex = '|'.join(not_supported_clouds)
  workspaces_schema = {
@@ -1606,7 +1612,8 @@ def get_config_schema():
  'type': 'object',
  'additionalProperties': False,
  'patternProperties': {
- # Pattern for non-GCP clouds - only allows 'disabled' property
+ # Pattern for clouds with no workspace-specific config -
+ # only allow 'disabled' property.
  f'^({not_supported_cloud_regex})$': {
  'type': 'object',
  'additionalProperties': False,
@@ -1641,6 +1648,18 @@ def get_config_schema():
  },
  'additionalProperties': False,
  },
+ 'aws': {
+ 'type': 'object',
+ 'properties': {
+ 'profile': {
+ 'type': 'string'
+ },
+ 'disabled': {
+ 'type': 'boolean'
+ },
+ },
+ 'additionalProperties': False,
+ },
  'ssh': {
  'type': 'object',
  'required': [],