skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/skylet/job_lib.py CHANGED
@@ -66,6 +66,7 @@ class JobInfoLoc(enum.IntEnum):
66
66
  PID = 9
67
67
  LOG_PATH = 10
68
68
  METADATA = 11
69
+ EXIT_CODES = 12
69
70
 
70
71
 
71
72
  def create_table(cursor, conn):
@@ -124,6 +125,8 @@ def create_table(cursor, conn):
124
125
  'metadata',
125
126
  'TEXT DEFAULT \'{}\'',
126
127
  value_to_replace_existing_entries='{}')
128
+ db_utils.add_column_to_table(cursor, conn, 'jobs', 'exit_codes',
129
+ 'TEXT DEFAULT NULL')
127
130
  conn.commit()
128
131
 
129
132
 
@@ -388,10 +391,16 @@ def add_job(job_name: str,
388
391
  assert _DB is not None
389
392
  job_submitted_at = time.time()
390
393
  # job_id will autoincrement with the null value
391
- _DB.cursor.execute(
392
- 'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
393
- (job_name, username, job_submitted_at, JobStatus.INIT.value,
394
- run_timestamp, None, resources_str, metadata))
394
+ if int(constants.SKYLET_VERSION) >= 28:
395
+ _DB.cursor.execute(
396
+ 'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?, null)', # pylint: disable=line-too-long
397
+ (job_name, username, job_submitted_at, JobStatus.INIT.value,
398
+ run_timestamp, None, resources_str, metadata))
399
+ else:
400
+ _DB.cursor.execute(
401
+ 'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)', # pylint: disable=line-too-long
402
+ (job_name, username, job_submitted_at, JobStatus.INIT.value,
403
+ run_timestamp, None, resources_str, metadata))
395
404
  _DB.conn.commit()
396
405
  rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
397
406
  (run_timestamp,))
@@ -468,6 +477,41 @@ def set_status(job_id: int, status: JobStatus) -> None:
468
477
  _set_status_no_lock(job_id, status)
469
478
 
470
479
 
@init_db
def set_exit_codes(job_id: int, exit_codes: List[int]) -> None:
    """Persist a job's exit codes in the ``exit_codes`` column of ``jobs``.

    The codes are serialized into one comma-separated string before being
    written, e.g. ``[0, 1]`` is stored as ``'0,1'``.

    Args:
        job_id: ID of the job whose row is updated.
        exit_codes: Exit codes to record, in order.
    """
    assert _DB is not None
    serialized = ','.join(map(str, exit_codes))
    update_sql = 'UPDATE jobs SET exit_codes=(?) WHERE job_id=(?)'
    # The write happens while holding the file lock at
    # _get_lock_path(job_id).
    with filelock.FileLock(_get_lock_path(job_id)):
        _DB.cursor.execute(update_sql, (serialized, job_id))
        _DB.conn.commit()
494
+
495
+
@init_db
def get_exit_codes(job_id: int) -> Optional[List[int]]:
    """Read a job's exit codes back from the comma-separated column.

    Args:
        job_id: The job ID to retrieve exit codes for.

    Returns:
        The stored exit codes as a list of ints. An empty list is returned
        when the stored string is empty. None is returned when the job has
        no row or its ``exit_codes`` column is NULL.
    """
    assert _DB is not None
    rows = _DB.cursor.execute('SELECT exit_codes FROM jobs WHERE job_id=(?)',
                              (job_id,))
    row = rows.fetchone()
    if row is None or row[0] is None:
        return None
    # An empty list is serialized as '' by ','.join(...), and ''.split(',')
    # yields [''], on which int() raises ValueError. Skip empty fields so
    # that an empty stored value round-trips to [].
    return [int(code) for code in row[0].split(',') if code.strip()]
513
+
514
+
471
515
  @init_db
472
516
  def set_job_started(job_id: int) -> None:
473
517
  # TODO(mraheja): remove pylint disabling when filelock version updated.
@@ -506,6 +550,20 @@ def get_status(job_id: int) -> Optional[JobStatus]:
506
550
  return get_status_no_lock(job_id)
507
551
 
508
552
 
def wait_for_job_completion(job_id: int, poll_interval: float = 1.0) -> None:
    """Block until the job is in a terminal state or has no status.

    Args:
        job_id: The job ID to wait for.
        poll_interval: Seconds to sleep between two status lookups.
    """
    status = get_status(job_id)
    # A status of None also ends the wait, so this cannot spin forever on a
    # job for which get_status() returns nothing.
    while status is not None and not status.is_terminal():
        time.sleep(poll_interval)
        status = get_status(job_id)
565
+
566
+
509
567
  @init_db
510
568
  def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
511
569
  return message_utils.encode_payload(get_statuses(job_ids))
@@ -674,6 +732,14 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
674
732
  'pid': row[JobInfoLoc.PID.value],
675
733
  'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
676
734
  })
735
+ if int(constants.SKYLET_VERSION) >= 28:
736
+ exit_code_str = row[JobInfoLoc.EXIT_CODES.value]
737
+ if not isinstance(exit_code_str, str):
738
+ records[-1]['exit_codes'] = None
739
+ else:
740
+ records[-1]['exit_codes'] = ([
741
+ int(code) for code in exit_code_str.split(',')
742
+ ])
677
743
  return records
678
744
 
679
745
 
@@ -1152,6 +1218,15 @@ class JobLibCodeGen:
1152
1218
  ]
1153
1219
  return cls._build(code)
1154
1220
 
@classmethod
def wait_for_job(cls, job_id: int) -> str:
    """Build the command that waits for ``job_id`` to complete.

    The generated snippet calls ``job_lib.wait_for_job_completion`` only
    when ``job_lib`` has that attribute; otherwise it evaluates to None.
    """
    # TODO(kevin): backward compatibility, remove in 0.13.0.
    wait_stmt = (f'job_lib.wait_for_job_completion({job_id!r}) '
                 'if hasattr(job_lib, "wait_for_job_completion") else None')
    return cls._build([wait_stmt])
1229
+
1155
1230
  @classmethod
1156
1231
  def update_status(cls) -> str:
1157
1232
  code = ['job_lib.update_status()']
@@ -1269,8 +1344,19 @@ class JobLibCodeGen:
1269
1344
  ]
1270
1345
  return cls._build(code)
1271
1346
 
@classmethod
def get_job_exit_codes(cls, job_id: Optional[int] = None) -> str:
    """Build the command that prints a job's exit codes.

    When ``job_id`` is None the generated snippet falls back to
    ``job_lib.get_latest_job_id()``.
    """
    select_job = (f'job_id = {job_id} if {job_id} is not None '
                  'else job_lib.get_latest_job_id()')
    # NOTE(review): the fallback value is an empty dict literal, whereas
    # job_lib.get_exit_codes() yields a list or None -- confirm that the
    # consumer of the printed output expects this.
    fetch_codes = ('exit_codes = job_lib.get_exit_codes(job_id) '
                   'if job_id is not None '
                   'and int(constants.SKYLET_VERSION) >= 28 else {}')
    emit_codes = 'print(exit_codes, flush=True)'
    return cls._build([select_job, fetch_codes, emit_codes])
1356
+
1272
1357
  @classmethod
1273
1358
  def _build(cls, code: List[str]) -> str:
1274
1359
  code = cls._PREFIX + code
1275
1360
  code = ';'.join(code)
1276
- return f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}'
1361
+ return (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
1362
+ f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}')
sky/skylet/log_lib.py CHANGED
@@ -172,7 +172,7 @@ def run_with_log(
172
172
  streaming_prefix: Optional[str] = None,
173
173
  log_cmd: bool = False,
174
174
  **kwargs,
175
- ) -> Union[int, Tuple[int, str, str]]:
175
+ ) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
176
176
  """Runs a command and logs its output to a file.
177
177
 
178
178
  Args:
@@ -183,6 +183,8 @@ def run_with_log(
183
183
  process_stream: Whether to post-process the stdout/stderr of the
184
184
  command, such as replacing or skipping lines on the fly. If
185
185
  enabled, lines are printed only when '\r' or '\n' is found.
186
+ streaming_prefix: Optional prefix for each log line. Can contain {pid}
187
+ placeholder which will be replaced with the subprocess PID.
186
188
 
187
189
  Returns the returncode or returncode, stdout and stderr of the command.
188
190
  Note that the stdout and stderr is already decoded.
@@ -228,6 +230,13 @@ def run_with_log(
228
230
  # For backward compatibility, do not specify use_kill_pg by
229
231
  # default.
230
232
  subprocess_utils.kill_process_daemon(proc.pid)
233
+
234
+ # Format streaming_prefix with subprocess PID if it contains {pid}
235
+ formatted_streaming_prefix = streaming_prefix
236
+ if streaming_prefix and '{pid}' in streaming_prefix:
237
+ formatted_streaming_prefix = streaming_prefix.format(
238
+ pid=proc.pid)
239
+
231
240
  stdout = ''
232
241
  stderr = ''
233
242
  stdout_stream_handler = None
@@ -256,7 +265,7 @@ def run_with_log(
256
265
  line_processor=line_processor,
257
266
  # Replace CRLF when the output is logged to driver by ray.
258
267
  replace_crlf=with_ray,
259
- streaming_prefix=streaming_prefix,
268
+ streaming_prefix=formatted_streaming_prefix,
260
269
  )
261
270
  stdout_stream_handler = functools.partial(
262
271
  _handle_io_stream,
@@ -349,7 +358,8 @@ def run_bash_command_with_log(bash_command: str,
349
358
  log_path: str,
350
359
  env_vars: Optional[Dict[str, str]] = None,
351
360
  stream_logs: bool = False,
352
- with_ray: bool = False):
361
+ with_ray: bool = False,
362
+ streaming_prefix: Optional[str] = None):
353
363
  with tempfile.NamedTemporaryFile('w', prefix='sky_app_',
354
364
  delete=False) as fp:
355
365
  bash_command = make_task_bash_script(bash_command, env_vars=env_vars)
@@ -364,6 +374,7 @@ def run_bash_command_with_log(bash_command: str,
364
374
  log_path,
365
375
  stream_logs=stream_logs,
366
376
  with_ray=with_ray,
377
+ streaming_prefix=streaming_prefix,
367
378
  shell=True)
368
379
 
369
380
 
@@ -372,9 +383,14 @@ def run_bash_command_with_log_and_return_pid(
372
383
  log_path: str,
373
384
  env_vars: Optional[Dict[str, str]] = None,
374
385
  stream_logs: bool = False,
375
- with_ray: bool = False):
376
- return_code = run_bash_command_with_log(bash_command, log_path, env_vars,
377
- stream_logs, with_ray)
386
+ with_ray: bool = False,
387
+ streaming_prefix: Optional[str] = None):
388
+ return_code = run_bash_command_with_log(bash_command,
389
+ log_path,
390
+ env_vars,
391
+ stream_logs,
392
+ with_ray,
393
+ streaming_prefix=streaming_prefix)
378
394
  return {'return_code': return_code, 'pid': os.getpid()}
379
395
 
380
396
 
sky/skylet/log_lib.pyi CHANGED
@@ -68,7 +68,7 @@ def run_with_log(cmd: Union[List[str], str],
68
68
  process_stream: bool = ...,
69
69
  line_processor: Optional[log_utils.LineProcessor] = ...,
70
70
  streaming_prefix: Optional[str] = ...,
71
- ray_job_id: Optional[str] = ...,
71
+ log_cmd: bool = ...,
72
72
  **kwargs) -> int:
73
73
  ...
74
74
 
@@ -87,7 +87,7 @@ def run_with_log(cmd: Union[List[str], str],
87
87
  process_stream: bool = ...,
88
88
  line_processor: Optional[log_utils.LineProcessor] = ...,
89
89
  streaming_prefix: Optional[str] = ...,
90
- ray_job_id: Optional[str] = ...,
90
+ log_cmd: bool = ...,
91
91
  **kwargs) -> Tuple[int, str, str]:
92
92
  ...
93
93
 
@@ -106,8 +106,8 @@ def run_with_log(cmd: Union[List[str], str],
106
106
  process_stream: bool = ...,
107
107
  line_processor: Optional[log_utils.LineProcessor] = ...,
108
108
  streaming_prefix: Optional[str] = ...,
109
- ray_job_id: Optional[str] = ...,
110
- **kwargs) -> Union[int, Tuple[int, str, str]]:
109
+ log_cmd: bool = ...,
110
+ **kwargs) -> Tuple[int, int]:
111
111
  ...
112
112
 
113
113
 
@@ -125,7 +125,8 @@ def run_bash_command_with_log(bash_command: str,
125
125
  log_path: str,
126
126
  env_vars: Optional[Dict[str, str]] = ...,
127
127
  stream_logs: bool = ...,
128
- with_ray: bool = ...):
128
+ with_ray: bool = ...,
129
+ streaming_prefix: Optional[str] = ...) -> int:
129
130
  ...
130
131
 
131
132
 
@@ -134,7 +135,8 @@ def run_bash_command_with_log_and_return_pid(
134
135
  log_path: str,
135
136
  env_vars: Optional[Dict[str, str]] = ...,
136
137
  stream_logs: bool = ...,
137
- with_ray: bool = ...):
138
+ with_ray: bool = ...,
139
+ streaming_prefix: Optional[str] = ...) -> Dict[str, Union[int, str]]:
138
140
  ...
139
141
 
140
142
 
sky/skylet/services.py CHANGED
@@ -197,12 +197,11 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
197
197
  f.write(request.codegen)
198
198
  os.chmod(script_path, 0o755)
199
199
 
200
- cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
201
200
  job_submit_cmd = (
202
201
  # JOB_CMD_IDENTIFIER is used for identifying the process
203
202
  # retrieved with pid is the same driver process.
204
203
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
205
- f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
204
+ f'{constants.SKY_PYTHON_CMD} -u {script_path}'
206
205
  # Do not use &>, which is not POSIX and may not work.
207
206
  # Note that the order of ">filename 2>&1" matters.
208
207
  f' > {remote_log_path} 2>&1')
@@ -387,6 +386,21 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
387
386
  except Exception as e: # pylint: disable=broad-except
388
387
  context.abort(grpc.StatusCode.INTERNAL, str(e))
389
388
 
def GetJobExitCodes(  # type: ignore[return]
        self, request: jobsv1_pb2.GetJobExitCodesRequest,
        context: grpc.ServicerContext
) -> jobsv1_pb2.GetJobExitCodesResponse:
    """Return the stored exit codes of a job.

    Uses ``request.job_id`` when that field is set and otherwise the value
    of ``job_lib.get_latest_job_id()``. Any exception aborts the RPC with
    status INTERNAL and the exception text as details.
    """
    try:
        if request.HasField('job_id'):
            job_id = request.job_id
        else:
            job_id = job_lib.get_latest_job_id()
        exit_codes: Optional[List[int]] = None
        if job_id:
            # A missing (None) or empty result is reported as an empty list.
            exit_codes = job_lib.get_exit_codes(job_id) or []
        return jobsv1_pb2.GetJobExitCodesResponse(exit_codes=exit_codes)
    except Exception as e:  # pylint: disable=broad-except
        context.abort(grpc.StatusCode.INTERNAL, str(e))
403
+
390
404
 
391
405
  class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
392
406
  ):
@@ -488,7 +502,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
488
502
  entrypoint=job.get('entrypoint'),
489
503
  metadata=converted_metadata,
490
504
  pool=job.get('pool'),
491
- pool_hash=job.get('pool_hash'))
505
+ pool_hash=job.get('pool_hash'),
506
+ links=job.get('links'))
492
507
  jobs_info.append(job_info)
493
508
 
494
509
  return managed_jobsv1_pb2.GetJobTableResponse(
sky/skylet/skylet.py CHANGED
@@ -48,8 +48,12 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
48
48
  # putting it here for visibility.
49
49
  # TODO(kevin): Determine the optimal max number of threads.
50
50
  max_workers = min(32, (os.cpu_count() or 1) + 4)
51
+ # There's only a single skylet process per cluster, so disable
52
+ # SO_REUSEPORT to raise an error if the port is already in use.
53
+ options = (('grpc.so_reuseport', 0),)
51
54
  server = grpc.server(
52
- concurrent.futures.ThreadPoolExecutor(max_workers=max_workers))
55
+ concurrent.futures.ThreadPoolExecutor(max_workers=max_workers),
56
+ options=options)
53
57
 
54
58
  autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
55
59
  services.AutostopServiceImpl(), server)
sky/skylet/subprocess_daemon.py CHANGED
@@ -110,7 +110,8 @@ def kill_process_tree(process: psutil.Process,
110
110
 
111
111
 
112
112
  def main():
113
- # daemonize()
113
+ daemonize()
114
+
114
115
  parser = argparse.ArgumentParser()
115
116
  parser.add_argument('--parent-pid', type=int, required=True)
116
117
  parser.add_argument('--proc-pid', type=int, required=True)
sky/ssh_node_pools/constants.py ADDED
@@ -0,0 +1,12 @@
"""Constants for SSH Node Pools"""
# pylint: disable=line-too-long
import os

_expand_user = os.path.expanduser

# Files under the user's ~/.kube and ~/.ssh directories.
DEFAULT_KUBECONFIG_PATH = _expand_user('~/.kube/config')
SSH_CONFIG_PATH = _expand_user('~/.ssh/config')

# Files and directories under the user's ~/.sky directory.
DEFAULT_SSH_NODE_POOLS_PATH = _expand_user('~/.sky/ssh_node_pools.yaml')
NODE_POOLS_INFO_DIR = _expand_user('~/.sky/ssh_node_pools_info')
NODE_POOLS_KEY_DIR = _expand_user('~/.sky/ssh_keys')

# TODO (kyuds): make this configurable?
# NOTE(review): this is a fixed, publicly known value; if it is used as a
# cluster join secret it offers no protection -- confirm and consider
# generating it per deployment.
K3S_TOKEN = 'mytoken'  # Any string can be used as the token
sky/ssh_node_pools/core.py CHANGED
@@ -1,10 +1,15 @@
1
1
  """SSH Node Pool management core functionality."""
2
2
  import os
3
3
  from pathlib import Path
4
- from typing import Any, Dict, List
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  import yaml
7
7
 
8
+ from sky import clouds
9
+ from sky.ssh_node_pools import constants
10
+ from sky.ssh_node_pools import deploy
11
+ from sky.usage import usage_lib
12
+ from sky.utils import common_utils
8
13
  from sky.utils import yaml_utils
9
14
 
10
15
 
@@ -12,8 +17,8 @@ class SSHNodePoolManager:
12
17
  """Manager for SSH Node Pool configurations."""
13
18
 
14
19
  def __init__(self):
15
- self.config_path = Path.home() / '.sky' / 'ssh_node_pools.yaml'
16
- self.keys_dir = Path.home() / '.sky' / 'ssh_keys'
20
+ self.config_path = Path(constants.DEFAULT_SSH_NODE_POOLS_PATH)
21
+ self.keys_dir = Path(constants.NODE_POOLS_KEY_DIR)
17
22
  self.keys_dir.mkdir(parents=True, exist_ok=True)
18
23
 
19
24
  def get_all_pools(self) -> Dict[str, Any]:
@@ -133,3 +138,35 @@ def list_ssh_keys() -> List[str]:
133
138
  """List available SSH keys."""
134
139
  manager = SSHNodePoolManager()
135
140
  return manager.list_ssh_keys()
141
+
142
+
@usage_lib.entrypoint
def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
    """Deploy a Kubernetes cluster on SSH targets, or tear it down.

    This is a thin wrapper that forwards both arguments to ``deploy.run``.

    Args:
        infra: Name of the cluster configuration in ssh_node_pools.yaml.
            If None, the first cluster in the file is used.
        cleanup: If True, clean up the cluster instead of deploying.
    """
    deploy.run(infra=infra, cleanup=cleanup)
153
+
154
+
@usage_lib.entrypoint
def ssh_status(context_name: str) -> Tuple[bool, str]:
    """Report whether an SSH Node Pool context is ready.

    Args:
        context_name: The SSH context name (e.g., 'ssh-my-cluster')

    Returns:
        A ``(is_ready, reason)`` pair as produced by
        ``clouds.SSH.check_single_context``. If that check raises, the
        pair is ``(False, <message containing the formatted exception>)``
        rather than propagating the error.
    """
    try:
        ready, explanation = clouds.SSH.check_single_context(context_name)
    except Exception as e:  # pylint: disable=broad-except
        failure = ('Failed to check SSH context: '
                   f'{common_utils.format_exception(e)}')
        return False, failure
    else:
        return ready, explanation
sky/ssh_node_pools/deploy/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """Module for Deploying SSH Node Pools"""
2
+ from sky.ssh_node_pools.deploy.deploy import run
3
+
4
+ __all__ = ['run']