skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (105)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +148 -30
  5. sky/backends/cloud_vm_ray_backend.py +606 -223
  6. sky/catalog/__init__.py +7 -0
  7. sky/catalog/aws_catalog.py +4 -0
  8. sky/catalog/common.py +18 -0
  9. sky/catalog/data_fetchers/fetch_aws.py +13 -37
  10. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  11. sky/catalog/seeweb_catalog.py +184 -0
  12. sky/client/cli/command.py +2 -71
  13. sky/client/sdk_async.py +5 -2
  14. sky/clouds/__init__.py +2 -0
  15. sky/clouds/aws.py +23 -5
  16. sky/clouds/cloud.py +8 -0
  17. sky/clouds/kubernetes.py +2 -0
  18. sky/clouds/seeweb.py +463 -0
  19. sky/core.py +46 -12
  20. sky/dashboard/out/404.html +1 -1
  21. sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
  22. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  27. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  30. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
  31. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  32. sky/dashboard/out/clusters/[cluster].html +1 -1
  33. sky/dashboard/out/clusters.html +1 -1
  34. sky/dashboard/out/config.html +1 -1
  35. sky/dashboard/out/index.html +1 -1
  36. sky/dashboard/out/infra/[context].html +1 -1
  37. sky/dashboard/out/infra.html +1 -1
  38. sky/dashboard/out/jobs/[job].html +1 -1
  39. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/exceptions.py +5 -0
  47. sky/global_user_state.py +75 -26
  48. sky/jobs/client/sdk_async.py +4 -2
  49. sky/jobs/controller.py +4 -2
  50. sky/jobs/recovery_strategy.py +1 -1
  51. sky/jobs/state.py +26 -16
  52. sky/jobs/utils.py +67 -24
  53. sky/logs/agent.py +10 -2
  54. sky/provision/__init__.py +1 -0
  55. sky/provision/kubernetes/config.py +7 -2
  56. sky/provision/kubernetes/instance.py +84 -41
  57. sky/provision/kubernetes/utils.py +14 -3
  58. sky/provision/seeweb/__init__.py +11 -0
  59. sky/provision/seeweb/config.py +13 -0
  60. sky/provision/seeweb/instance.py +806 -0
  61. sky/provision/vast/instance.py +1 -1
  62. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  63. sky/schemas/generated/jobsv1_pb2.py +86 -0
  64. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  65. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  66. sky/server/config.py +14 -5
  67. sky/server/metrics.py +41 -8
  68. sky/server/requests/executor.py +41 -4
  69. sky/server/server.py +1 -0
  70. sky/server/uvicorn.py +11 -5
  71. sky/setup_files/dependencies.py +8 -1
  72. sky/skylet/constants.py +14 -8
  73. sky/skylet/job_lib.py +128 -10
  74. sky/skylet/log_lib.py +14 -3
  75. sky/skylet/log_lib.pyi +9 -0
  76. sky/skylet/services.py +203 -0
  77. sky/skylet/skylet.py +4 -0
  78. sky/task.py +62 -0
  79. sky/templates/kubernetes-ray.yml.j2 +120 -3
  80. sky/templates/seeweb-ray.yml.j2 +108 -0
  81. sky/utils/accelerator_registry.py +3 -1
  82. sky/utils/command_runner.py +35 -11
  83. sky/utils/command_runner.pyi +22 -0
  84. sky/utils/context_utils.py +15 -2
  85. sky/utils/controller_utils.py +11 -5
  86. sky/utils/db/migration_utils.py +1 -1
  87. sky/utils/git.py +559 -1
  88. sky/utils/resource_checker.py +8 -7
  89. sky/workspaces/core.py +57 -21
  90. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
  91. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
  92. sky/client/cli/git.py +0 -549
  93. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  98. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  101. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
  102. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
  103. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
  104. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
  105. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/skylet/services.py CHANGED
@@ -1,11 +1,19 @@
  """gRPC service implementations for skylet."""

+ import os
+
  import grpc

  from sky import sky_logging
+ from sky.jobs import state as managed_job_state
  from sky.schemas.generated import autostopv1_pb2
  from sky.schemas.generated import autostopv1_pb2_grpc
+ from sky.schemas.generated import jobsv1_pb2
+ from sky.schemas.generated import jobsv1_pb2_grpc
+ from sky.serve import serve_state
  from sky.skylet import autostop_lib
+ from sky.skylet import constants
+ from sky.skylet import job_lib

  logger = sky_logging.init_logger(__name__)

@@ -42,3 +50,198 @@ class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
                  is_autostopping=is_autostopping)
          except Exception as e:  # pylint: disable=broad-except
              context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+
+ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
+     """Implementation of the JobsService gRPC service."""
+
+     def AddJob(  # type: ignore[return]
+             self, request: jobsv1_pb2.AddJobRequest,
+             context: grpc.ServicerContext) -> jobsv1_pb2.AddJobResponse:
+         try:
+             job_name = request.job_name if request.HasField('job_name') else '-'
+             job_id, log_dir = job_lib.add_job(job_name, request.username,
+                                               request.run_timestamp,
+                                               request.resources_str,
+                                               request.metadata)
+             return jobsv1_pb2.AddJobResponse(job_id=job_id, log_dir=log_dir)
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def QueueJob(  # type: ignore[return]
+             self, request: jobsv1_pb2.QueueJobRequest,
+             context: grpc.ServicerContext) -> jobsv1_pb2.QueueJobResponse:
+         try:
+             job_id = request.job_id
+             # Create log directory and file
+             remote_log_dir = os.path.expanduser(request.remote_log_dir)
+             os.makedirs(remote_log_dir, exist_ok=True)
+             remote_log_path = os.path.join(remote_log_dir, 'run.log')
+             open(remote_log_path, 'a').close()  # pylint: disable=unspecified-encoding
+
+             script_path = os.path.expanduser(request.script_path)
+             os.makedirs(os.path.dirname(script_path), exist_ok=True)
+
+             # If `codegen` is not provided, assume script is already
+             # uploaded to `script_path` via rsync.
+             if request.HasField('codegen'):
+                 with open(script_path, 'w', encoding='utf-8') as f:
+                     f.write(request.codegen)
+                 os.chmod(script_path, 0o755)
+
+             cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
+             job_submit_cmd = (
+                 # JOB_CMD_IDENTIFIER is used for identifying the process
+                 # retrieved with pid is the same driver process.
+                 f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
+                 f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
+                 # Do not use &>, which is not POSIX and may not work.
+                 # Note that the order of ">filename 2>&1" matters.
+                 f' > {remote_log_path} 2>&1')
+             job_lib.scheduler.queue(job_id, job_submit_cmd)
+
+             if request.HasField('managed_job'):
+                 managed_job = request.managed_job
+                 pool = managed_job.pool if managed_job.HasField(
+                     'pool') else None
+                 pool_hash = None
+                 if pool is not None:
+                     pool_hash = serve_state.get_service_hash(pool)
+                 # Add the managed job to job queue database.
+                 managed_job_state.set_job_info(job_id, managed_job.name,
+                                                managed_job.workspace,
+                                                managed_job.entrypoint, pool,
+                                                pool_hash)
+                 # Set the managed job to PENDING state to make sure that
+                 # this managed job appears in the `sky jobs queue`, even
+                 # if it needs to wait to be submitted.
+                 # We cannot set the managed job to PENDING state in the
+                 # job template (jobs-controller.yaml.j2), as it may need
+                 # to wait for the run commands to be scheduled on the job
+                 # controller in high-load cases.
+                 for task in managed_job.tasks:
+                     managed_job_state.set_pending(job_id, task.task_id,
+                                                   task.name, task.resources_str,
+                                                   task.metadata_json)
+             return jobsv1_pb2.QueueJobResponse()
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def UpdateStatus(  # type: ignore[return]
+             self, request: jobsv1_pb2.UpdateStatusRequest,
+             context: grpc.ServicerContext) -> jobsv1_pb2.UpdateStatusResponse:
+         try:
+             job_lib.update_status()
+             return jobsv1_pb2.UpdateStatusResponse()
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def GetJobQueue(  # type: ignore[return]
+             self, request: jobsv1_pb2.GetJobQueueRequest,
+             context: grpc.ServicerContext) -> jobsv1_pb2.GetJobQueueResponse:
+         try:
+             user_hash = request.user_hash if request.HasField(
+                 'user_hash') else None
+             all_jobs = request.all_jobs
+             jobs_info = job_lib.get_jobs_info(user_hash=user_hash,
+                                               all_jobs=all_jobs)
+             return jobsv1_pb2.GetJobQueueResponse(jobs=jobs_info)
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def CancelJobs(  # type: ignore[return]
+             self, request: jobsv1_pb2.CancelJobsRequest,
+             context: grpc.ServicerContext) -> jobsv1_pb2.CancelJobsResponse:
+         try:
+             job_ids = list(request.job_ids) if request.job_ids else []
+             user_hash = request.user_hash if request.HasField(
+                 'user_hash') else None
+             cancelled_job_ids = job_lib.cancel_jobs(job_ids, request.cancel_all,
+                                                     user_hash)
+             return jobsv1_pb2.CancelJobsResponse(
+                 cancelled_job_ids=cancelled_job_ids)
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def FailAllInProgressJobs(  # type: ignore[return]
+             self, _: jobsv1_pb2.FailAllInProgressJobsRequest,
+             context: grpc.ServicerContext
+     ) -> jobsv1_pb2.FailAllInProgressJobsResponse:
+         try:
+             job_lib.fail_all_jobs_in_progress()
+             return jobsv1_pb2.FailAllInProgressJobsResponse()
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def TailLogs(
+             self,
+             request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
+             context: grpc.ServicerContext):
+         # TODO(kevin): implement this
+         raise NotImplementedError('TailLogs is not implemented')
+
+     def GetJobStatus(  # type: ignore[return]
+             self, request: jobsv1_pb2.GetJobStatusRequest,
+             context: grpc.ServicerContext) -> jobsv1_pb2.GetJobStatusResponse:
+         try:
+             if request.job_ids:
+                 job_ids = list(request.job_ids)
+             else:
+                 latest_job_id = job_lib.get_latest_job_id()
+                 job_ids = [latest_job_id] if latest_job_id is not None else []
+             job_statuses = job_lib.get_statuses(job_ids)
+             for job_id, status in job_statuses.items():
+                 job_statuses[job_id] = job_lib.JobStatus(status).to_protobuf(
+                 ) if status is not None else jobsv1_pb2.JOB_STATUS_UNSPECIFIED
+             return jobsv1_pb2.GetJobStatusResponse(job_statuses=job_statuses)
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def GetJobSubmittedTimestamp(  # type: ignore[return]
+             self, request: jobsv1_pb2.GetJobSubmittedTimestampRequest,
+             context: grpc.ServicerContext
+     ) -> jobsv1_pb2.GetJobSubmittedTimestampResponse:
+         try:
+             job_id = request.job_id if request.HasField(
+                 'job_id') else job_lib.get_latest_job_id()
+             timestamp = job_lib.get_job_submitted_or_ended_timestamp(
+                 job_id, False)
+             if timestamp is None:
+                 context.abort(grpc.StatusCode.NOT_FOUND,
+                               f'Job {job_id} not found')
+             return jobsv1_pb2.GetJobSubmittedTimestampResponse(
+                 timestamp=timestamp)
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def GetJobEndedTimestamp(  # type: ignore[return]
+             self, request: jobsv1_pb2.GetJobEndedTimestampRequest,
+             context: grpc.ServicerContext
+     ) -> jobsv1_pb2.GetJobEndedTimestampResponse:
+         try:
+             job_id = request.job_id if request.HasField(
+                 'job_id') else job_lib.get_latest_job_id()
+             timestamp = job_lib.get_job_submitted_or_ended_timestamp(
+                 job_id, True)
+             if timestamp is None:
+                 context.abort(grpc.StatusCode.NOT_FOUND,
+                               f'Job {job_id} not found or not ended')
+             return jobsv1_pb2.GetJobEndedTimestampResponse(timestamp=timestamp)
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+     def GetLogDirsForJobs(  # type: ignore[return]
+             self, request: jobsv1_pb2.GetLogDirsForJobsRequest,
+             context: grpc.ServicerContext
+     ) -> jobsv1_pb2.GetLogDirsForJobsResponse:
+         try:
+             if request.job_ids:
+                 job_ids = list(request.job_ids)
+             else:
+                 latest_job_id = job_lib.get_latest_job_id()
+                 job_ids = [latest_job_id] if latest_job_id is not None else []
+             job_log_dirs = job_lib.get_job_log_dirs(job_ids)
+             return jobsv1_pb2.GetLogDirsForJobsResponse(
+                 job_log_dirs=job_log_dirs)
+         except Exception as e:  # pylint: disable=broad-except
+             context.abort(grpc.StatusCode.INTERNAL, str(e))
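
For context, a minimal client sketch for the new JobsService (not part of this diff). It assumes the generated stub follows protoc's usual naming (jobsv1_pb2_grpc.JobsServiceStub) and that it runs on the cluster head node, since skylet's gRPC server only binds localhost (see the skylet.py change below).

import grpc

from sky.schemas.generated import jobsv1_pb2
from sky.schemas.generated import jobsv1_pb2_grpc
from sky.skylet import constants


def list_jobs(all_jobs: bool = False):
    # Skylet listens on 127.0.0.1 only, so this must run on the head node.
    addr = f'127.0.0.1:{constants.SKYLET_GRPC_PORT}'
    with grpc.insecure_channel(addr) as channel:
        stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
        response = stub.GetJobQueue(
            jobsv1_pb2.GetJobQueueRequest(all_jobs=all_jobs))
        return response.jobs


if __name__ == '__main__':
    print(list_jobs(all_jobs=True))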
sky/skylet/skylet.py CHANGED
@@ -9,6 +9,7 @@ import grpc
  import sky
  from sky import sky_logging
  from sky.schemas.generated import autostopv1_pb2_grpc
+ from sky.schemas.generated import jobsv1_pb2_grpc
  from sky.skylet import constants
  from sky.skylet import events
  from sky.skylet import services
@@ -50,6 +51,9 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
      autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
          services.AutostopServiceImpl(), server)

+     jobsv1_pb2_grpc.add_JobsServiceServicer_to_server(
+         services.JobsServiceImpl(), server)
+
      listen_addr = f'127.0.0.1:{port}'
      server.add_insecure_port(listen_addr)

sky/task.py CHANGED
@@ -20,6 +20,7 @@ from sky.provision import docker_utils
  from sky.serve import service_spec
  from sky.skylet import constants
  from sky.utils import common_utils
+ from sky.utils import git
  from sky.utils import registry
  from sky.utils import schemas
  from sky.utils import ux_utils
@@ -1596,6 +1597,67 @@ class Task:
              d[k] = v
          return d

+     def update_workdir(self, workdir: Optional[str], git_url: Optional[str],
+                        git_ref: Optional[str]) -> 'Task':
+         """Updates the task workdir.
+
+         Args:
+             workdir: The workdir to update.
+             git_url: The git url to update.
+             git_ref: The git ref to update.
+         """
+         if self.workdir is None or isinstance(self.workdir, str):
+             if workdir is not None:
+                 self.workdir = workdir
+                 return self
+             if git_url is not None:
+                 self.workdir = {}
+                 self.workdir['url'] = git_url
+                 if git_ref is not None:
+                     self.workdir['ref'] = git_ref
+                 return self
+             return self
+         if git_url is not None:
+             self.workdir['url'] = git_url
+         if git_ref is not None:
+             self.workdir['ref'] = git_ref
+         return self
+
+     def update_envs_and_secrets_from_workdir(self) -> 'Task':
+         """Updates the task envs and secrets from the workdir."""
+         if self.workdir is None:
+             return self
+         if not isinstance(self.workdir, dict):
+             return self
+         url = self.workdir['url']
+         ref = self.workdir.get('ref', '')
+         token = os.environ.get(git.GIT_TOKEN_ENV_VAR)
+         ssh_key_path = os.environ.get(git.GIT_SSH_KEY_PATH_ENV_VAR)
+         try:
+             git_repo = git.GitRepo(url, ref, token, ssh_key_path)
+             clone_info = git_repo.get_repo_clone_info()
+             if clone_info is None:
+                 return self
+             self.envs[git.GIT_URL_ENV_VAR] = clone_info.url
+             if ref:
+                 ref_type = git_repo.get_ref_type()
+                 if ref_type == git.GitRefType.COMMIT:
+                     self.envs[git.GIT_COMMIT_HASH_ENV_VAR] = ref
+                 elif ref_type == git.GitRefType.BRANCH:
+                     self.envs[git.GIT_BRANCH_ENV_VAR] = ref
+                 elif ref_type == git.GitRefType.TAG:
+                     self.envs[git.GIT_TAG_ENV_VAR] = ref
+             if clone_info.token is None and clone_info.ssh_key is None:
+                 return self
+             if clone_info.token is not None:
+                 self.secrets[git.GIT_TOKEN_ENV_VAR] = clone_info.token
+             if clone_info.ssh_key is not None:
+                 self.secrets[git.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
+         except exceptions.GitError as e:
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError(f'{str(e)}') from None
+         return self
+
      def to_yaml_config(self,
                         use_user_specified_yaml: bool = False) -> Dict[str, Any]:
          """Returns a yaml-style dict representation of the task.
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -654,8 +654,125 @@ available_node_types:
  # after v0.11.0 release.
  touch /tmp/apt_ssh_setup_started

- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
-   echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
+ # Helper: run apt-get update with retries
+ apt_update_with_retries() {
+   # do not fail the whole shell; we handle return codes
+   set +e
+   local log=/tmp/apt-update.log
+   local tries=3
+   local delay=1
+   local i
+   for i in $(seq 1 $tries); do
+     DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
+     echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
+     sleep $delay
+     delay=$((delay * 2))
+   done
+   set -e
+   return 1
+ }
+ apt_install_with_retries() {
+   local packages="$@"
+   [ -z "$packages" ] && return 0
+   set +e
+   local log=/tmp/apt-update.log
+   local tries=3
+   local delay=1
+   local i
+   for i in $(seq 1 $tries); do
+     DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
+     echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
+     DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
+     DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
+     sleep $delay
+     delay=$((delay * 2))
+   done
+   set -e
+   return 1
+ }
+ apt_update_install_with_retries() {
+   apt_update_with_retries
+   apt_install_with_retries "$@"
+ }
+ backup_dir=/etc/apt/sources.list.backup_skypilot
+ backup_source() {
+   $(prefix_cmd) mkdir -p "$backup_dir"
+   if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
+     $(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
+   fi
+ }
+ restore_source() {
+   if [ -f "$backup_dir/sources.list" ]; then
+     $(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
+   fi
+ }
+ update_apt_sources() {
+   local host=$1
+   local apt_file=$2
+   $(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
+ }
+ # Helper: install packages across mirrors with retries
+ apt_install_with_mirrors() {
+   local required=$1; shift
+   local packages="$@"
+   [ -z "$packages" ] && return 0
+   set +e
+   # Install packages with default sources first
+   local log=/tmp/apt-update.log
+   echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
+   restore_source
+   apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
+   echo "Install failed with default sources: $packages" >> "$log"
+   # Detect distro (ubuntu/debian)
+   local APT_OS="unknown"
+   if [ -f /etc/os-release ]; then
+     . /etc/os-release
+     case "$ID" in
+       debian) APT_OS="debian" ;;
+       ubuntu) APT_OS="ubuntu" ;;
+       *)
+         if [ -n "$ID_LIKE" ]; then
+           case " $ID $ID_LIKE " in
+             *ubuntu*) APT_OS="ubuntu" ;;
+             *debian*) APT_OS="debian" ;;
+           esac
+         fi
+         ;;
+     esac
+   fi
+   # Build mirror candidates
+   # deb.debian.org is a CDN endpoint, if one backend goes down,
+   # the CDN automatically fails over to another mirror,
+   # so we only retry for ubuntu here.
+   if [ "$APT_OS" = "ubuntu" ]; then
+     # Backup current sources once
+     backup_source
+     # Selected from https://launchpad.net/ubuntu/+archivemirrors
+     # and results from apt-select
+     local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
+     for host in $MIRROR_CANDIDATES; do
+       echo "Trying APT mirror ($APT_OS): $host" >> "$log"
+       if [ -f /etc/apt/sources.list ]; then
+         update_apt_sources $host /etc/apt/sources.list
+       else
+         echo "Error: /etc/apt/sources.list not found" >> "$log"
+         break
+       fi
+       apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
+       echo "Install failed with mirror ($APT_OS): $host" >> "$log"
+       # Restore to default sources
+       restore_source
+     done
+   fi
+   set -e
+   if [ "$required" = "1" ]; then
+     echo "Error: required package install failed across all mirrors: $packages" >> "$log"
+     return 1
+   else
+     echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
+     return 0
+   fi
+ }
  # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
  # so that both fusemount and fusermount3 can be masked before enabling SSH access.
  PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
@@ -682,7 +799,7 @@ available_node_types:
  done;
  if [ ! -z "$INSTALL_FIRST" ]; then
    echo "Installing core packages: $INSTALL_FIRST";
-   DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
+   apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
  fi;
  # SSH and other packages are not necessary, so we disable set -e
  set +e
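
The same retry-then-fall-back flow as a standalone sketch (not part of the template), shown only to make the control flow of the helpers above easier to follow; the commands and mirror-switching step are placeholders.

import subprocess
import time


def run_with_retries(cmd, tries=3, delay=1.0):
    # A few attempts with exponential backoff, mirroring
    # apt_update_with_retries / apt_install_with_retries above.
    for attempt in range(1, tries + 1):
        if subprocess.run(cmd, check=False).returncode == 0:
            return True
        print(f'attempt {attempt}/{tries} failed; retrying in {delay}s')
        time.sleep(delay)
        delay *= 2
    return False


def install_with_mirrors(packages, mirrors, required=True):
    # Try the default sources first, then each fallback mirror in turn,
    # as apt_install_with_mirrors does for Ubuntu hosts.
    for mirror in [None] + list(mirrors):
        if mirror is not None:
            print(f'(placeholder) switching APT sources to {mirror}')
        if run_with_retries(['apt-get', 'install', '-y'] + list(packages)):
            return True
    # Only required packages fail the setup; optional ones are skipped.
    return not required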
sky/templates/seeweb-ray.yml.j2 ADDED
@@ -0,0 +1,108 @@
+ cluster_name: {{ cluster_name_on_cloud }}
+
+ max_workers: {{ num_nodes - 1 }}
+ upscaling_speed: {{ num_nodes - 1 }}
+ idle_timeout_minutes: 5
+
+ provider:
+   type: external
+   module: sky.provision.seeweb
+   region: "{{ region }}"
+
+ auth:
+   ssh_user: ecuser
+   ssh_private_key: {{ ssh_private_key }}
+
+ available_node_types:
+   ray_head_default:
+     resources: {}
+     node_config:
+       plan: {{ instance_type }}
+       image: {{ image_id }}
+       location: {{ region }}
+       {% if seeweb_gpu_config is not none %}
+       gpu: {{ seeweb_gpu_config.gpu }}
+       gpu_label: "{{ seeweb_gpu_config.gpu_label }}"
+       {% endif %}
+       disk: {{ disk_size }}
+
+ head_node_type: ray_head_default
+
+ file_mounts: {
+   "~/.seeweb_cloud/seeweb_keys": "~/.seeweb_cloud/seeweb_keys",
+   "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+   {%- for remote_path, local_path in credentials.items() %}
+   "{{remote_path}}": "{{local_path}}",
+   {%- endfor %}
+   "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
+ }
+
+ rsync_exclude: []
+
+ setup_commands:
+   - |
+     touch ~/.bashrc;
+     echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts || true;
+     echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts || true;
+     sudo systemctl stop unattended-upgrades || true;
+     sudo systemctl disable unattended-upgrades || true;
+     sudo apt update && sudo apt install -y patch || sudo yum install -y patch || true;
+     {{ conda_installation_commands }}
+     {{ ray_skypilot_installation_commands }}
+
+ head_start_ray_commands:
+   - |
+     retry_ray() {
+       local n=0; local max=30
+       until [ $n -ge $max ]; do
+         export SKYPILOT_NUM_GPUS=0
+         command -v nvidia-smi >/dev/null 2>&1 && \
+           SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
+
+         ray stop || true
+         RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
+         ray start --disable-usage-stats --head \
+           --port={{ ray_port }} --dashboard-port={{ ray_dashboard_port }} \
+           --object-manager-port=8076 \
+           --autoscaling-config=~/ray_bootstrap_config.yaml \
+           --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
+
+         echo "[head] Ray failed to start ($((++n))/$max), retrying in 5s..."
+         sleep 5
+       done
+       [ $n -eq $max ] && { echo "Ray head failed"; exit 1; }
+     }
+     retry_ray
+
+ worker_start_ray_commands:
+   - |
+     retry_ray() {
+       local n=0; local max=30
+       until [ $n -ge $max ]; do
+         SKYPILOT_NUM_GPUS=0
+         command -v nvidia-smi >/dev/null 2>&1 && \
+           SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
+
+         ray stop || true
+         RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 \
+         ray start --disable-usage-stats \
+           --address=$RAY_HEAD_IP:{{ ray_port }} \
+           --object-manager-port=8076 \
+           --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ ray_temp_dir }} && break
+
+         echo "[worker] Ray failed to start ($((++n))/$max), retrying in 5s..."
+         sleep 5
+       done
+       [ $n -eq $max ] && { echo "Ray worker failed"; exit 1; }
+     }
+     retry_ray
+
+ head_node: {}
+ worker_nodes: {}
+
+ head_setup_commands: []
+ worker_setup_commands: []
+
+ cluster_synced_files: []
+ file_mounts_sync_continuously: False
sky/utils/accelerator_registry.py CHANGED
@@ -107,10 +107,12 @@ def canonicalize_accelerator_name(accelerator: str,
      if not names and cloud_str in ['Kubernetes', None]:
          with rich_utils.safe_status(
                  ux_utils.spinner_message('Listing accelerators on Kubernetes')):
+             # Only search for Kubernetes to reduce the lookup cost.
+             # For other clouds, the catalog has been searched in previous steps.
              searched = catalog.list_accelerators(
                  name_filter=accelerator,
                  case_sensitive=False,
-                 clouds=cloud_str,
+                 clouds='Kubernetes',
              )
              names = list(searched.keys())
      if accelerator in names:
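
For reference, a short sketch of the narrowed fallback lookup as a standalone call (the accelerator name below is an arbitrary example; `catalog.list_accelerators` is the same function used in accelerator_registry.py).

from sky import catalog

# Only the Kubernetes catalog is consulted in the fallback path; other
# clouds were already covered by the preceding catalog search.
searched = catalog.list_accelerators(name_filter='H100',
                                     case_sensitive=False,
                                     clouds='Kubernetes')
names = list(searched.keys())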