skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (105)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +148 -30
  5. sky/backends/cloud_vm_ray_backend.py +606 -223
  6. sky/catalog/__init__.py +7 -0
  7. sky/catalog/aws_catalog.py +4 -0
  8. sky/catalog/common.py +18 -0
  9. sky/catalog/data_fetchers/fetch_aws.py +13 -37
  10. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  11. sky/catalog/seeweb_catalog.py +184 -0
  12. sky/client/cli/command.py +2 -71
  13. sky/client/sdk_async.py +5 -2
  14. sky/clouds/__init__.py +2 -0
  15. sky/clouds/aws.py +23 -5
  16. sky/clouds/cloud.py +8 -0
  17. sky/clouds/kubernetes.py +2 -0
  18. sky/clouds/seeweb.py +463 -0
  19. sky/core.py +46 -12
  20. sky/dashboard/out/404.html +1 -1
  21. sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
  22. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  27. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  30. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
  31. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  32. sky/dashboard/out/clusters/[cluster].html +1 -1
  33. sky/dashboard/out/clusters.html +1 -1
  34. sky/dashboard/out/config.html +1 -1
  35. sky/dashboard/out/index.html +1 -1
  36. sky/dashboard/out/infra/[context].html +1 -1
  37. sky/dashboard/out/infra.html +1 -1
  38. sky/dashboard/out/jobs/[job].html +1 -1
  39. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/exceptions.py +5 -0
  47. sky/global_user_state.py +75 -26
  48. sky/jobs/client/sdk_async.py +4 -2
  49. sky/jobs/controller.py +4 -2
  50. sky/jobs/recovery_strategy.py +1 -1
  51. sky/jobs/state.py +26 -16
  52. sky/jobs/utils.py +67 -24
  53. sky/logs/agent.py +10 -2
  54. sky/provision/__init__.py +1 -0
  55. sky/provision/kubernetes/config.py +7 -2
  56. sky/provision/kubernetes/instance.py +84 -41
  57. sky/provision/kubernetes/utils.py +14 -3
  58. sky/provision/seeweb/__init__.py +11 -0
  59. sky/provision/seeweb/config.py +13 -0
  60. sky/provision/seeweb/instance.py +806 -0
  61. sky/provision/vast/instance.py +1 -1
  62. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  63. sky/schemas/generated/jobsv1_pb2.py +86 -0
  64. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  65. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  66. sky/server/config.py +14 -5
  67. sky/server/metrics.py +41 -8
  68. sky/server/requests/executor.py +41 -4
  69. sky/server/server.py +1 -0
  70. sky/server/uvicorn.py +11 -5
  71. sky/setup_files/dependencies.py +8 -1
  72. sky/skylet/constants.py +14 -8
  73. sky/skylet/job_lib.py +128 -10
  74. sky/skylet/log_lib.py +14 -3
  75. sky/skylet/log_lib.pyi +9 -0
  76. sky/skylet/services.py +203 -0
  77. sky/skylet/skylet.py +4 -0
  78. sky/task.py +62 -0
  79. sky/templates/kubernetes-ray.yml.j2 +120 -3
  80. sky/templates/seeweb-ray.yml.j2 +108 -0
  81. sky/utils/accelerator_registry.py +3 -1
  82. sky/utils/command_runner.py +35 -11
  83. sky/utils/command_runner.pyi +22 -0
  84. sky/utils/context_utils.py +15 -2
  85. sky/utils/controller_utils.py +11 -5
  86. sky/utils/db/migration_utils.py +1 -1
  87. sky/utils/git.py +559 -1
  88. sky/utils/resource_checker.py +8 -7
  89. sky/workspaces/core.py +57 -21
  90. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
  91. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
  92. sky/client/cli/git.py +0 -549
  93. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  98. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  101. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
  102. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
  103. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
  104. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
  105. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
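
The largest addition is the new Seeweb provisioner, sky/provision/seeweb/instance.py, shown in full below. As orientation, here is a minimal sketch of how SkyPilot's provisioning layer might drive the module's standalone entry points. The function names, signatures, and ProvisionConfig fields are taken from the diff; the call order, cluster name, key name, and the plan/image/location values are illustrative assumptions.

from sky.provision import common
from sky.provision.seeweb import instance as seeweb_instance
from sky.utils import status_lib

config = common.ProvisionConfig(
    provider_config={},
    authentication_config={'remote_key_name': 'my-seeweb-key'},  # assumed key name
    docker_config={},
    node_config={'plan': 'eCS4', 'image': 'ubuntu-2204', 'location': 'it-mi2'},
    count=1,
    tags={},
    resume_stopped_nodes=False,
    ports_to_open_on_launch=None,
)
record = seeweb_instance.run_instances('it-mi2', 'sky-demo-cluster', config)
seeweb_instance.wait_instances('it-mi2', 'sky-demo-cluster',
                               state=status_lib.ClusterStatus.UP)
info = seeweb_instance.get_cluster_info('it-mi2', 'sky-demo-cluster',
                                        provider_config={})
print(record.head_instance_id, info.head_instance_id)
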
@@ -0,0 +1,806 @@
1
+ """Seeweb provisioner for SkyPilot / Ray autoscaler.
2
+
3
+ Prerequisites:
4
+ pip install ecsapi
5
+ """
6
+
7
+ import os
8
+ import subprocess
9
+ import time
10
+ from typing import Any, Dict, List, Optional, Tuple
11
+
12
+ from sky import authentication as auth
13
+ from sky import sky_logging
14
+ from sky.adaptors import seeweb as seeweb_adaptor
15
+ from sky.provision import common
16
+ from sky.provision.common import ClusterInfo
17
+ from sky.provision.common import InstanceInfo
18
+ from sky.provision.common import ProvisionConfig
19
+ from sky.provision.common import ProvisionRecord
20
+ from sky.utils import command_runner # Unified SSH helper
21
+ from sky.utils import common_utils
22
+ from sky.utils import status_lib
23
+
24
+ logger = sky_logging.init_logger(__name__)
25
+
26
+ # Singleton Seeweb client reused across the module
27
+ _seeweb_client = None
28
+
29
+
30
+ def _get_seeweb_client():
31
+ """Return a singleton Seeweb ECS API client."""
32
+ global _seeweb_client
33
+ if _seeweb_client is None:
34
+ # Initialize via adaptor's cached client
35
+ _seeweb_client = seeweb_adaptor.client()
36
+ return _seeweb_client
37
+
38
+
39
+ # --------------------------------------------------------------------------- #
40
+ # Useful constants
41
+ # --------------------------------------------------------------------------- #
42
+ _POLL_INTERVAL = 5 # sec
43
+ _MAX_BOOT_TIME = 1200 # sec
44
+ _ACTION_WATCH_MAX_RETRY = 360 # number of polls before giving up
45
+ _ACTION_WATCH_FETCH_EVERY = 5 # seconds between polls
46
+ _API_RETRY_MAX_RETRIES = 5
47
+ _API_RETRY_INITIAL_BACKOFF = 1
48
+
49
+
50
+ # --------------------------------------------------------------------------- #
51
+ # Class required by the Ray backend
52
+ # --------------------------------------------------------------------------- #
53
+ class SeewebNodeProvider:
54
+ """Minimalist provisioner for Seeweb ECS."""
55
+
56
+ def __init__(self, provider_config: ProvisionConfig, cluster_name: str):
57
+ """provider_config: dict populated by template (plan, image, location,
58
+ remote_key_name, optional gpu…)
59
+ cluster_name : SkyPilot name on cloud (used in notes)
60
+ """
61
+ self.config = provider_config
62
+ self.cluster_name = cluster_name
63
+ # Reuse a singleton Seeweb client to avoid repeated authentications/API
64
+ # object creations across different provider instances.
65
+ self.ecs = _get_seeweb_client()
66
+
67
+ def _get_ssh_user(self) -> str:
68
+ # Prefer auth config; fallback to template default for Seeweb
69
+ return (self.config.authentication_config.get('ssh_user') if self.config
70
+ and self.config.authentication_config else None) or 'ecuser'
71
+
72
+ def _get_private_key_path(self) -> str:
73
+ # Prefer explicit path from auth config; otherwise use SkyPilot key
74
+ key_path = None
75
+ if self.config and self.config.authentication_config:
76
+ key_path = self.config.authentication_config.get('ssh_private_key')
77
+ if not key_path:
78
+ key_path, _ = auth.get_or_generate_keys()
79
+ return os.path.expanduser(key_path)
80
+
81
+ # ------------------------------------------------------------------ #
82
+ # Helper: run a command on the VM via SSH using CommandRunner
83
+ # ------------------------------------------------------------------ #
84
+ def _run_remote(self,
85
+ server_ip: str,
86
+ cmd: str,
87
+ *,
88
+ timeout: int = 30,
89
+ stream_logs: bool = False) -> subprocess.CompletedProcess:
90
+ """Execute *cmd* on the remote host.
91
+
92
+ Uses sky.utils.command_runner.SSHCommandRunner for consistent SSH
93
+ options across all providers.
94
+ Returns a subprocess.CompletedProcess-like
95
+ object with returncode, stdout, stderr.
96
+ """
97
+ runner = command_runner.SSHCommandRunner(
98
+ node=(server_ip, 22),
99
+ ssh_user=self._get_ssh_user(),
100
+ ssh_private_key=self._get_private_key_path(),
101
+ )
102
+ rc, stdout, stderr = runner.run(cmd,
103
+ stream_logs=stream_logs,
104
+ require_outputs=True,
105
+ connect_timeout=timeout)
106
+ # Convert to simple namespace for compatibility
107
+ proc = subprocess.CompletedProcess(args=cmd,
108
+ returncode=rc,
109
+ stdout=stdout.encode(),
110
+ stderr=stderr.encode())
111
+ return proc
112
+
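# Illustrative use of the _run_remote helper above; the IP address and the
# command are hypothetical, not taken from the diff:
#
#     proc = self._run_remote('203.0.113.10', 'uptime', timeout=15)
#     if proc.returncode == 0:
#         logger.debug('Remote uptime: %s', proc.stdout.decode().strip())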
113
+ # --------------------------------------------------------------------- #
114
+ # 1. bootstrap_instances – no preprocessing needed here
115
+ # --------------------------------------------------------------------- #
116
+
117
+ # --------------------------------------------------------------------- #
118
+ # 2. run_instances: restart or create until we reach count
119
+ # --------------------------------------------------------------------- #
120
+ def run_instances(self, config: Dict, count: int) -> None:
121
+ existing = self._query_cluster_nodes()
122
+ del config # unused
123
+ running = [
124
+ s for s in existing if s.status in ('Booted', 'Running', 'RUNNING',
125
+ 'Booting', 'PoweringOn')
126
+ ]
127
+
128
+ # a) restart Off servers
129
+ for srv in (s for s in existing if s.status == 'Booted'):
130
+ specific_status = self.ecs.fetch_server_status(srv.name)
131
+ if specific_status == 'SHUTOFF':
132
+ logger.info(f'Powering on server {srv.name}')
133
+ self._power_on(srv.name)
134
+ running.append(srv)
135
+ if len(running) >= count:
136
+ break
137
+
138
+ # b) create new VMs if missing
139
+ while len(running) < count:
140
+ self._create_server()
141
+ running.append({})  # placeholder entry; only the count matters here
142
+
143
+ # --------------------------------------------------------------------- #
144
+ # 3. terminate_instances
145
+ # --------------------------------------------------------------------- #
146
+ def terminate_instances(self) -> None:
147
+ for srv in self._query_cluster_nodes():
148
+ logger.info('Deleting server %s …', srv.name)
149
+ # Delete the server (DELETE /servers/{name}), retrying with
150
+ # exponential backoff to handle transient API errors.
151
+ common_utils.retry(self.ecs.delete_server,
152
+ max_retries=_API_RETRY_MAX_RETRIES,
153
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)(srv.name)
156
+
157
+ # --------------------------------------------------------------------- #
158
+ # 4. stop_instances
159
+ # --------------------------------------------------------------------- #
160
+ def stop_instances(self) -> None:
161
+ cluster_nodes = self._query_cluster_nodes()
162
+
163
+ for srv in cluster_nodes:
164
+ try:
165
+ specific_status = self.ecs.fetch_server_status(srv.name)
166
+ except Exception:  # pylint: disable=broad-except
167
+ # Fall back to the general status if the specific one
168
+ # cannot be fetched.
169
+ specific_status = None
170
+
171
+ if specific_status == 'SHUTOFF':
172
+ logger.info(f'Server {srv.name} is already stopped')
173
+ continue
174
+ if srv.status in ('Booted', 'Running', 'RUNNING'):
175
+ self._power_off(srv.name)
176
+ else:
177
+ logger.info(f'Server {srv.name} has status '
178
+ f'{srv.status}, skipping')
186
+ # Wait for all servers to be actually stopped with forced refresh
187
+ self._wait_for_stop_with_forced_refresh()
188
+
189
+ # --------------------------------------------------------------------- #
190
+ # 5. query_instances
191
+ # --------------------------------------------------------------------- #
192
+ def query_instances(self) -> Dict[str, str]:
193
+ """Query instances status using both fetch_servers()
194
+ and fetch_server_status().
195
+
196
+ Seeweb has two different APIs:
197
+ - fetch_servers() returns states like 'Booted', 'Booting'
198
+ - fetch_server_status() returns states like 'SHUTOFF' (stopped)
199
+
200
+ We need to use fetch_server_status() to get the correct stopped state.
201
+ """
202
+ instances = {}
203
+ cluster_nodes = self._query_cluster_nodes()
204
+
205
+ for server in cluster_nodes:
206
+ # Always try to get the specific status first for more accuracy
207
+ try:
208
+ specific_status = self.ecs.fetch_server_status(server.name)
209
+ instances[server.name] = specific_status
210
+ except Exception: # pylint: disable=broad-except
211
+ # Fallback to general status if fetch_server_status fails
212
+ general_status = server.status
213
+ instances[server.name] = general_status
214
+
215
+ return instances
216
+
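# Illustrative shape of the dict returned above (names and states are
# hypothetical):
#
#     {'sky-demo-cluster-head': 'Booted',
#      'sky-demo-cluster-worker1': 'SHUTOFF'}
#
# Per the docstring, 'SHUTOFF' can only be observed via fetch_server_status().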
217
+ # --------------------------------------------------------------------- #
218
+ # 6. wait_instances
219
+ # --------------------------------------------------------------------- #
220
+ def wait_instances(self, desired_state: str = 'Booted') -> None:
221
+ deadline = time.time() + _MAX_BOOT_TIME
222
+
223
+ while time.time() < deadline:
224
+ cluster_nodes = self._query_cluster_nodes()
225
+
226
+ # For SHUTOFF state, we need to use fetch_server_status()
227
+ # to get the real status
228
+ if desired_state == 'SHUTOFF':
229
+ all_shutoff = True
230
+ for server in cluster_nodes:
231
+ try:
232
+ specific_status = self.ecs.fetch_server_status(
233
+ server.name)
234
+ if specific_status != 'SHUTOFF':
235
+ all_shutoff = False
236
+ except Exception: # pylint: disable=broad-except
237
+ all_shutoff = False
238
+
239
+ if all_shutoff:
240
+ return
241
+ else:
242
+ # For other states, use the general status
243
+ states = {srv.status for srv in cluster_nodes}
244
+
245
+ if states <= {desired_state}:
246
+ # If all servers are Booted, wait
247
+ # for them to be truly stable
248
+ if desired_state == 'Booted':
249
+ if self._wait_for_all_servers_stable():
250
+ return
251
+ else:
252
+ time.sleep(_POLL_INTERVAL)
253
+ continue
254
+ return
255
+
256
+ time.sleep(_POLL_INTERVAL)
257
+
258
+ raise TimeoutError(
259
+ f'Nodes are not all in state {desired_state} within timeout')
260
+
261
+ def _wait_for_all_servers_stable(self, max_wait: int = 600) -> bool:
262
+ """Waits for all cluster servers to be stable."""
263
+ logger.info('Checking stability of all cluster servers...')
264
+
265
+ start_time = time.time()
266
+ while time.time() - start_time < max_wait:
267
+ cluster_nodes = self._query_cluster_nodes()
268
+ all_stable = True
269
+
270
+ for node in cluster_nodes:
271
+ if node.status == 'Booted':
272
+ # Check that server is reachable via ping
273
+ if not self._ping_server(node.ipv4):
274
+ logger.warning(f'Server {node.name} ({node.ipv4}) '
275
+ f'not reachable via ping')
276
+ all_stable = False
277
+ break
278
+
279
+ # SSH readiness handled by provisioner.wait_for_ssh()
280
+
281
+ logger.info(f'Server {node.name} ({node.ipv4}) is stable')
282
+
283
+ if all_stable:
284
+ logger.info('All servers are stable')
285
+ # Safety sleep to allow for late reboots
286
+ logger.info('Waiting 1 second to allow for late reboots...')
287
+ time.sleep(1)
288
+ return True
289
+
290
+ logger.info('Waiting for all servers to be stable...')
291
+ time.sleep(1)
292
+
293
+ logger.error('Timeout waiting for server stability')
294
+ return False
295
+
296
+ def _ping_server(self, server_ip: str) -> bool:
297
+ """Check that server is reachable via ping."""
298
+ try:
299
+ result = subprocess.run(['ping', '-c', '1', '-W', '5', server_ip],
300
+ capture_output=True,
301
+ timeout=10,
302
+ check=False)
303
+ return result.returncode == 0
304
+ except Exception as e: # pylint: disable=broad-except
305
+ logger.debug(f'Error pinging {server_ip}: {e}')
306
+ return False
307
+
308
+ def _check_ssh_ready(self, server_ip: str) -> bool:
309
+ """Check that SSH is available on the server."""
310
+ try:
311
+ ssh_user = self._get_ssh_user()
312
+ private_key_path = self._get_private_key_path()
313
+ result = subprocess.run([
314
+ 'ssh', '-o', 'ConnectTimeout=10', '-o',
315
+ 'StrictHostKeyChecking=no', '-o',
316
+ f'UserKnownHostsFile={os.devnull}', '-o',
317
+ f'GlobalKnownHostsFile={os.devnull}', '-o',
318
+ 'IdentitiesOnly=yes', '-i', private_key_path,
319
+ f'{ssh_user}@{server_ip}', 'echo "SSH ready"'
320
+ ],
321
+ capture_output=True,
322
+ timeout=15,
323
+ check=False)
324
+ return result.returncode == 0
325
+ except Exception as e: # pylint: disable=broad-except
326
+ logger.debug(f'Error checking SSH on {server_ip}: {e}')
327
+ return False
328
+
329
+ # ------------------------------------------------------------------ #
330
+ # 7. open_ports / cleanup_ports – Seeweb has all ports open by default
331
+ # ------------------------------------------------------------------ #
332
+ def open_ports(
333
+ self,
334
+ cluster_name_on_cloud: str,
335
+ ports: List[str],
336
+ provider_config: Optional[Dict[str, Any]] = None,
337
+ ) -> None:
338
+ """See sky/provision/__init__.py"""
339
+ logger.debug(f'Skip opening ports {ports} for Seeweb instances, as all '
340
+ 'ports are open by default.')
341
+ del cluster_name_on_cloud, provider_config, ports
342
+
343
+ def cleanup_ports(
344
+ self,
345
+ cluster_name_on_cloud: str,
346
+ ports: List[str],
347
+ provider_config: Optional[Dict[str, Any]] = None,
348
+ ) -> None:
349
+ del cluster_name_on_cloud, ports, provider_config # Unused.
350
+
351
+ # ====================== private helpers ========================= #
352
+ def _query_cluster_nodes(self):
353
+ """List servers with notes == cluster_name."""
354
+ servers = common_utils.retry(
355
+ self.ecs.fetch_servers,
356
+ max_retries=_API_RETRY_MAX_RETRIES,
357
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
358
+ return [
359
+ s for s in servers
360
+ if s.notes and s.notes.startswith(self.cluster_name)
361
+ ]
362
+
363
+ def query_cluster_nodes(self):
364
+ """Public wrapper for querying cluster nodes for this cluster."""
365
+ return common_utils.retry(self._query_cluster_nodes,
366
+ max_retries=_API_RETRY_MAX_RETRIES,
367
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
368
+
369
+ def _get_head_instance_id(self) -> Optional[str]:
370
+ """Return head instance id for this cluster.
371
+
372
+ Prefer notes == "{cluster}-head"; fallback to first node if none
373
+ matches (legacy naming).
374
+ """
375
+ nodes = common_utils.retry(self._query_cluster_nodes,
376
+ max_retries=_API_RETRY_MAX_RETRIES,
377
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
378
+ for node in nodes:
379
+ try:
380
+ if getattr(node, 'notes', None) == f'{self.cluster_name}-head':
381
+ return node.name
382
+ if getattr(node, 'name', None) and node.name.endswith('-head'):
383
+ return node.name
384
+ except Exception: # pylint: disable=broad-except
385
+ continue
386
+ return nodes[0].name if nodes else None
387
+
388
+ def get_head_instance_id(self) -> Optional[str]:
389
+ """Public wrapper for getting head instance id."""
390
+ return common_utils.retry(self._get_head_instance_id,
391
+ max_retries=_API_RETRY_MAX_RETRIES,
392
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
393
+
394
+ def _create_server(self):
395
+ """POST /servers with complete payload."""
396
+ node_type = 'head'
397
+ payload = {
398
+ 'plan': self.config.node_config.get('plan'), # e.g. eCS4
399
+ 'image': self.config.node_config.get('image'), # e.g. ubuntu-2204
400
+ 'location': self.config.node_config.get('location'), # e.g. it-mi2
401
+ 'notes': f'{self.cluster_name}-{node_type}',
402
+ 'ssh_key': self.config.authentication_config.get('remote_key_name'
403
+ ), # remote key
404
+ }
405
+
406
+ # Optional GPU
407
+ if 'gpu' in self.config.node_config:
408
+ payload.update({
409
+ 'gpu': self.config.node_config.get('gpu'),
410
+ 'gpu_label': self.config.node_config.get('gpu_label', ''),
411
+ })
412
+
413
+ # Build the request object expected by ecsapi
414
+ server_create_request_cls = (
415
+ seeweb_adaptor.ecsapi.ServerCreateRequest # type: ignore
416
+ )
417
+ create_request = server_create_request_cls(**payload)
418
+
419
+ logger.info('Creating Seeweb server %s', payload)
420
+
421
+ # POST /servers – returns (response, action_id)
422
+ _, action_id = common_utils.retry(
423
+ self.ecs.create_server,
424
+ max_retries=_API_RETRY_MAX_RETRIES,
425
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)(
426
+ create_request, check_if_can_create=False)
427
+ self.ecs.watch_action(action_id,
428
+ max_retry=_ACTION_WATCH_MAX_RETRY,
429
+ fetch_every=_ACTION_WATCH_FETCH_EVERY)
430
+
431
+ def _power_on(self, server_id: str):
432
+ try:
433
+ common_utils.retry(
434
+ self.ecs.turn_on_server,
435
+ max_retries=_API_RETRY_MAX_RETRIES,
436
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server_id)
437
+ except seeweb_adaptor.SeewebError as e:
438
+ logger.error(f'Error in _power_on for {server_id}: {e}')
439
+ raise
440
+
441
+ def _power_off(self, server_id: str):
442
+ try:
443
+ common_utils.retry(
444
+ self.ecs.turn_off_server,
445
+ max_retries=_API_RETRY_MAX_RETRIES,
446
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server_id)
447
+ except seeweb_adaptor.SeewebError as e:
448
+ logger.error(f'Error in _power_off for {server_id}: {e}')
449
+ raise
450
+
451
+ def _wait_action(self, action_id: int):
452
+ """Poll action until it completes."""
453
+ while True:
454
+ action = common_utils.retry(
455
+ self.ecs.fetch_action,
456
+ max_retries=_API_RETRY_MAX_RETRIES,
457
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)(action_id)
458
+ if action['status'] in ('completed', 'ok', 'no_content'):
459
+ return
460
+ if action['status'] == 'error':
461
+ raise RuntimeError(f'Seeweb action {action_id} failed')
462
+ time.sleep(_POLL_INTERVAL)
463
+
464
+ def _wait_for_stop_with_forced_refresh(self, max_wait: int = 300) -> None:
465
+ """Wait for servers to be stopped with
466
+ aggressive polling and forced refresh."""
467
+ start_time = time.time()
468
+ poll_interval = 1 # 1 second for aggressive polling
469
+
470
+ while time.time() - start_time < max_wait:
471
+ # Force refresh by re-fetching cluster nodes
472
+ cluster_nodes = common_utils.retry(
473
+ self._query_cluster_nodes,
474
+ max_retries=_API_RETRY_MAX_RETRIES,
475
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)()
476
+
477
+ all_stopped = True
478
+ for server in cluster_nodes:
479
+ try:
480
+ # Always use fetch_server_status() for accurate status
481
+ specific_status = common_utils.retry(
482
+ self.ecs.fetch_server_status,
483
+ max_retries=_API_RETRY_MAX_RETRIES,
484
+ initial_backoff=_API_RETRY_INITIAL_BACKOFF)(server.name)
485
+
486
+ if specific_status != 'SHUTOFF':
487
+ all_stopped = False
488
+
489
+ except Exception: # pylint: disable=broad-except
490
+ all_stopped = False
491
+
492
+ if all_stopped:
493
+ return
494
+
495
+ time.sleep(poll_interval)
496
+
497
+ raise TimeoutError(f'Servers not stopped within {max_wait} seconds')
498
+
499
+
500
+ # =============================================================================
501
+ # Standalone functions required by the provisioning interface
502
+ # =============================================================================
503
+
504
+
505
+ def run_instances(region: str, cluster_name_on_cloud: str,
506
+ config: ProvisionConfig) -> ProvisionRecord:
507
+ """Run instances for Seeweb cluster."""
508
+ provider = SeewebNodeProvider(config, cluster_name_on_cloud)
509
+ provider.run_instances(config.node_config, config.count)
510
+
511
+ # Find the head node using notes convention
512
+ cluster_nodes = provider.query_cluster_nodes()
513
+ if not cluster_nodes:
514
+ raise RuntimeError(
515
+ f'No nodes found for cluster {cluster_name_on_cloud}')
516
+ head_node_id = provider.get_head_instance_id()
517
+ assert head_node_id is not None, 'head_instance_id should not be None'
518
+
519
+ return ProvisionRecord(
520
+ provider_name='Seeweb',
521
+ region=region,
522
+ zone=None, # Seeweb doesn't use zones
523
+ cluster_name=cluster_name_on_cloud,
524
+ head_instance_id=head_node_id,
525
+ resumed_instance_ids=[], # Empty for now
526
+ created_instance_ids=[node.name for node in cluster_nodes],
527
+ )
528
+
529
+
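# For a single-node cluster, the record returned above would look roughly like
# this (names and region are hypothetical):
#
#     ProvisionRecord(provider_name='Seeweb', region='it-mi2', zone=None,
#                     cluster_name='sky-demo-cluster',
#                     head_instance_id='sky-demo-cluster-head',
#                     resumed_instance_ids=[],
#                     created_instance_ids=['sky-demo-cluster-head'])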
530
+ def stop_instances(
531
+ cluster_name_on_cloud: str,
532
+ provider_config: Optional[Dict[str, Any]] = None,
533
+ worker_only: bool = False,
534
+ ) -> None:
535
+ """Stop instances for Seeweb cluster."""
536
+ del worker_only # unused - Seeweb doesn't distinguish between head/worker
537
+ assert provider_config is not None
538
+
539
+ # Convert Dict to ProvisionConfig for SeewebNodeProvider
540
+ config = common.ProvisionConfig(
541
+ provider_config=provider_config,
542
+ authentication_config={},
543
+ docker_config={},
544
+ node_config=provider_config,
545
+ count=1, # Not used for stop operation
546
+ tags={},
547
+ resume_stopped_nodes=False,
548
+ ports_to_open_on_launch=None,
549
+ )
550
+ provider = SeewebNodeProvider(config, cluster_name_on_cloud)
551
+ provider.stop_instances()
552
+
553
+
554
+ def terminate_instances(
555
+ cluster_name_on_cloud: str,
556
+ provider_config: Optional[Dict[str, Any]] = None,
557
+ worker_only: bool = False,
558
+ ) -> None:
559
+ """Terminate instances for Seeweb cluster."""
560
+ del worker_only # unused - Seeweb doesn't distinguish between head/worker
561
+ assert provider_config is not None
562
+ # Convert Dict to ProvisionConfig for SeewebNodeProvider
563
+ config = common.ProvisionConfig(
564
+ provider_config=provider_config,
565
+ authentication_config={},
566
+ docker_config={},
567
+ node_config=provider_config,
568
+ count=1, # Not used for terminate operation
569
+ tags={},
570
+ resume_stopped_nodes=False,
571
+ ports_to_open_on_launch=None,
572
+ )
573
+ provider = SeewebNodeProvider(config, cluster_name_on_cloud)
574
+ provider.terminate_instances()
575
+
576
+
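# The Dict -> ProvisionConfig conversion above is duplicated in
# stop_instances(), terminate_instances() and query_instances() (below). A
# hypothetical shared helper, not part of this diff, could factor it out:
#
#     def _provision_config_from_dict(
#             provider_config: Dict[str, Any]) -> common.ProvisionConfig:
#         return common.ProvisionConfig(provider_config=provider_config,
#                                       authentication_config={},
#                                       docker_config={},
#                                       node_config=provider_config,
#                                       count=1,  # unused by these operations
#                                       tags={},
#                                       resume_stopped_nodes=False,
#                                       ports_to_open_on_launch=None)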
577
+ def wait_instances(
578
+ region: str,
579
+ cluster_name_on_cloud: str,
580
+ state: Optional[status_lib.ClusterStatus],
581
+ ) -> None:
582
+ del region # unused
583
+ # Map ClusterStatus to Seeweb string
584
+ if state == status_lib.ClusterStatus.UP:
585
+ seeweb_state = 'Booted'
586
+ elif state == status_lib.ClusterStatus.STOPPED:
587
+ seeweb_state = 'SHUTOFF'
588
+ elif state is None:
589
+ seeweb_state = 'Terminated' # For termination
590
+ else:
591
+ seeweb_state = 'Booted' # Default fallback
592
+
593
+ # Create Seeweb client directly and wait
594
+ client = _get_seeweb_client()
595
+ deadline = time.time() + _MAX_BOOT_TIME
596
+ while time.time() < deadline:
597
+ cluster_nodes = [
598
+ s for s in client.fetch_servers()
599
+ if s.notes and s.notes.startswith(cluster_name_on_cloud)
600
+ ]
601
+ if not cluster_nodes:
602
+ if seeweb_state == 'Terminated':
603
+ # All servers are already gone; nothing left to wait for.
604
+ return
605
+ time.sleep(_POLL_INTERVAL)
606
+ continue
604
+
605
+ states = {srv.status for srv in cluster_nodes}
606
+ if states <= {seeweb_state}:
607
+ # If all servers are Booted, wait for them to be truly stable
608
+ if seeweb_state == 'Booted':
609
+ if _wait_for_all_servers_stable_standalone(cluster_nodes):
610
+ return
611
+ else:
612
+ time.sleep(_POLL_INTERVAL)
613
+ continue
614
+ return
615
+ time.sleep(_POLL_INTERVAL)
616
+
617
+ raise TimeoutError(
618
+ f'Nodes are not all in state {seeweb_state} within timeout')
619
+
620
+
621
+ def _wait_for_all_servers_stable_standalone(cluster_nodes,
622
+ max_wait: int = 300) -> bool:
623
+ """Waits for all cluster servers to be stable (standalone version)."""
624
+ start_time = time.time()
625
+ while time.time() - start_time < max_wait:
626
+ all_stable = True
627
+
628
+ for node in cluster_nodes:
629
+ if node.status == 'Booted':
630
+ # Check that server is reachable via ping
631
+ if not _ping_server_standalone(node.ipv4):
632
+ all_stable = False
633
+ break
634
+
635
+ # Do not check SSH here; handled by provisioner.wait_for_ssh().
636
+
637
+ if all_stable:
638
+ # Safety sleep to allow for late reboots
639
+ time.sleep(1)
640
+ return True
641
+
642
+ time.sleep(1)
643
+
644
+ return False
645
+
646
+
647
+ def _ping_server_standalone(server_ip: str) -> bool:
648
+ """Check that server is reachable via ping (standalone version)."""
649
+ try:
650
+ result = subprocess.run(['ping', '-c', '1', '-W', '5', server_ip],
651
+ capture_output=True,
652
+ timeout=10,
653
+ check=False)
654
+ return result.returncode == 0
655
+ except Exception as e: # pylint: disable=broad-except
656
+ logger.error(f'Error pinging {server_ip}: {e}')
657
+ return False
658
+
659
+
660
+ def _check_ssh_ready_standalone(server_ip: str) -> bool:
661
+ """Check that SSH is available on the server (standalone version)."""
662
+ try:
663
+ private_key_path, _ = auth.get_or_generate_keys()
664
+ private_key_path = os.path.expanduser(private_key_path)
665
+ ssh_user = 'ecuser'
666
+ result = subprocess.run([
667
+ 'ssh', '-o', 'ConnectTimeout=10', '-o', 'StrictHostKeyChecking=no',
668
+ '-o', f'UserKnownHostsFile={os.devnull}', '-o',
669
+ f'GlobalKnownHostsFile={os.devnull}', '-o', 'IdentitiesOnly=yes',
670
+ '-i', private_key_path, f'{ssh_user}@{server_ip}',
671
+ 'echo "SSH ready"'
672
+ ],
673
+ capture_output=True,
674
+ timeout=15,
675
+ check=False)
676
+ return result.returncode == 0
677
+ except Exception: # pylint: disable=broad-except
678
+ return False
679
+
680
+
681
+ def query_instances(
682
+ cluster_name: str,
683
+ cluster_name_on_cloud: str,
684
+ provider_config: Optional[Dict[str, Any]] = None,
685
+ non_terminated_only: bool = True,
686
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
687
+ """Query instances status for Seeweb cluster."""
688
+ del cluster_name # unused
689
+ # Use the provided provider_config or default to empty dict
690
+ if provider_config is None:
691
+ provider_config = {}
692
+
693
+ # Convert Dict to ProvisionConfig for SeewebNodeProvider
694
+ config = common.ProvisionConfig(
695
+ provider_config=provider_config,
696
+ authentication_config={},
697
+ docker_config={},
698
+ node_config=provider_config,
699
+ count=1, # Not used for query operation
700
+ tags={},
701
+ resume_stopped_nodes=False,
702
+ ports_to_open_on_launch=None,
703
+ )
704
+ provider = SeewebNodeProvider(config, cluster_name_on_cloud)
705
+ seeweb_instances = provider.query_instances()
706
+
707
+ # Map Seeweb status to SkyPilot status
708
+ status_map = {
709
+ 'Booted':
710
+ status_lib.ClusterStatus.UP, # Seeweb uses "Booted" for running
711
+ 'RUNNING': status_lib.ClusterStatus.UP, # All caps version
712
+ 'Booting': status_lib.ClusterStatus.INIT,
713
+ 'PoweringOn': status_lib.ClusterStatus.INIT,
714
+ 'Off': status_lib.ClusterStatus.STOPPED,
715
+ 'Stopped': status_lib.ClusterStatus.STOPPED,
716
+ 'SHUTOFF':
717
+ status_lib.ClusterStatus.STOPPED, # Add missing SHUTOFF status
718
+ 'PoweringOff': status_lib.ClusterStatus.STOPPED,
719
+ # PoweringOff is mapped to STOPPED (not INIT).
720
+ }
721
+
722
+ result: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
723
+ Optional[str]]] = {}
724
+ for name, seeweb_status in seeweb_instances.items():
725
+ if non_terminated_only and seeweb_status in ('Terminated', 'Deleted'):
726
+ continue
727
+ mapped_status = status_map.get(seeweb_status,
728
+ status_lib.ClusterStatus.INIT)
729
+ # Return tuple of (status, reason) where reason is None for Seeweb
730
+ result[name] = (mapped_status, None)
731
+
732
+ return result
733
+
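# Illustrative return value after mapping (instance name hypothetical):
#
#     {'sky-demo-cluster-head': (status_lib.ClusterStatus.UP, None)}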
734
+
735
+ # Signature should not include provider_name; router strips it before calling
736
+ def get_cluster_info(
737
+ region: str,
738
+ cluster_name_on_cloud: str,
739
+ provider_config: Optional[Dict[str, Any]] = None,
740
+ ) -> 'ClusterInfo':
741
+ del region # unused
742
+ # Use Seeweb client to get cluster instances
743
+ client = _get_seeweb_client()
744
+ cluster_nodes = [
745
+ s for s in client.fetch_servers()
746
+ if s.notes and s.notes.startswith(cluster_name_on_cloud)
747
+ ]
748
+
749
+ if not cluster_nodes:
750
+ raise RuntimeError(
751
+ f'No instances found for cluster {cluster_name_on_cloud}')
752
+
753
+ instances = {}
754
+ head_instance = None
755
+ for node in cluster_nodes:
756
+ if getattr(node, 'notes', None) == f'{cluster_name_on_cloud}-head':
757
+ head_instance = node.name
758
+ break
759
+ if head_instance is None:
760
+ head_instance = cluster_nodes[0].name
761
+
762
+ for node in cluster_nodes:
766
+
767
+ # Get server IP (Seeweb uses 'ipv4' attribute)
768
+ external_ip = node.ipv4
769
+ internal_ip = external_ip # For Seeweb, internal IP = external IP
770
+
771
+ instances[node.name] = [
772
+ InstanceInfo(
773
+ instance_id=node.name,
774
+ internal_ip=internal_ip,
775
+ external_ip=external_ip,
776
+ ssh_port=22,
777
+ tags={},
778
+ )
779
+ ]
780
+
781
+ return ClusterInfo(
782
+ instances=instances,
783
+ head_instance_id=head_instance,
784
+ provider_name='Seeweb',
785
+ provider_config=provider_config,
786
+ )
787
+
788
+
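# Illustrative ClusterInfo for a one-node cluster (name and IP hypothetical):
#
#     ClusterInfo(
#         instances={'sky-demo-cluster-head': [InstanceInfo(
#             instance_id='sky-demo-cluster-head',
#             internal_ip='203.0.113.10',
#             external_ip='203.0.113.10',
#             ssh_port=22,
#             tags={})]},
#         head_instance_id='sky-demo-cluster-head',
#         provider_name='Seeweb',
#         provider_config=provider_config)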
789
+ def open_ports(
790
+ cluster_name_on_cloud: str,
791
+ ports: List[str],
792
+ provider_config: Optional[Dict[str, Any]] = None,
793
+ ) -> None:
794
+ del provider_config # Unused
795
+ logger.debug(f'Seeweb: skipping open_ports for {cluster_name_on_cloud} '
796
+ f'(ports={ports}); all ports are open by default')
797
+ return
798
+
799
+
800
+ def cleanup_ports(
801
+ cluster_name_on_cloud: str,
802
+ ports: List[str],
803
+ provider_config: Optional[Dict[str, Any]] = None,
804
+ ) -> None:
805
+ del cluster_name_on_cloud, ports, provider_config # Unused.
806
+ return