skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -16,7 +16,7 @@ import re
 import threading
 import time
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
 import uuid
 
 import sqlalchemy
@@ -1020,8 +1020,46 @@ async def cluster_event_retention_daemon():
         await asyncio.sleep(sleep_amount)
 
 
-def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
-                       event_type: ClusterEventType) -> List[str]:
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: Literal[False],
+    limit: Optional[int] = ...,
+) -> List[str]:
+    ...
+
+
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: Literal[True],
+    limit: Optional[int] = ...,
+) -> List[Dict[str, Union[str, int]]]:
+    ...
+
+
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: bool = ...,
+    limit: Optional[int] = ...,
+) -> Union[List[str], List[Dict[str, Union[str, int]]]]:
+    ...
+
+
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: bool = False,
+    limit: Optional[int] = None
+) -> Union[List[str], List[Dict[str, Union[str, int]]]]:
     """Returns the cluster events for the cluster.
 
     Args:
@@ -1030,22 +1068,44 @@ def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
         cluster_hash: Hash of the cluster. Cannot be specified if cluster_name
             is specified.
         event_type: Type of the event.
+        include_timestamps: If True, returns list of dicts with 'reason' and
+            'transitioned_at' fields. If False, returns list of reason strings.
+        limit: If specified, returns at most this many events (most recent).
+            If None, returns all events.
+
+    Returns:
+        If include_timestamps is False: List of reason strings.
+        If include_timestamps is True: List of dicts with 'reason' and
+            'transitioned_at' (unix timestamp) fields.
+        Events are ordered from oldest to newest.
     """
     assert _SQLALCHEMY_ENGINE is not None
 
-    if cluster_name is not None and cluster_hash is not None:
-        raise ValueError('Cannot specify both cluster_name and cluster_hash')
-    if cluster_name is None and cluster_hash is None:
-        raise ValueError('Must specify either cluster_name or cluster_hash')
-    if cluster_name is not None:
-        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
-        if cluster_hash is None:
-            raise ValueError(f'Hash for cluster {cluster_name} not found.')
-
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.query(cluster_event_table).filter_by(
-            cluster_hash=cluster_hash, type=event_type.value).order_by(
-                cluster_event_table.c.transitioned_at.asc()).all()
+    cluster_hash = _resolve_cluster_hash(cluster_hash, cluster_name)
+    if cluster_hash is None:
+        raise ValueError(f'Hash for cluster {cluster_name} not found.')
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if limit is not None:
+            # To get the most recent N events in ASC order, we use a subquery:
+            # 1. Get most recent N events (ORDER BY DESC LIMIT N)
+            # 2. Re-order them by ASC
+            subquery = session.query(cluster_event_table).filter_by(
+                cluster_hash=cluster_hash, type=event_type.value).order_by(
+                    cluster_event_table.c.transitioned_at.desc()).limit(
+                        limit).subquery()
+            rows = session.query(subquery).order_by(
+                subquery.c.transitioned_at.asc()).all()
+        else:
+            rows = session.query(cluster_event_table).filter_by(
+                cluster_hash=cluster_hash, type=event_type.value).order_by(
+                    cluster_event_table.c.transitioned_at.asc()).all()
+
+    if include_timestamps:
+        return [{
+            'reason': row.reason,
+            'transitioned_at': row.transitioned_at
+        } for row in rows]
     return [row.reason for row in rows]
 
 
@@ -1537,6 +1597,38 @@ def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
     return row.cluster_hash
 
 
+def _resolve_cluster_hash(cluster_hash: Optional[str] = None,
+                          cluster_name: Optional[str] = None) -> Optional[str]:
+    """Resolve cluster_hash from either cluster_hash or cluster_name.
+
+    Validates that exactly one of cluster_hash or cluster_name is provided,
+    then resolves cluster_name to cluster_hash if needed.
+
+    Args:
+        cluster_hash: Direct cluster hash, if known.
+        cluster_name: Cluster name to resolve to hash.
+
+    Returns:
+        The cluster_hash string, or None if cluster_name was provided but
+        the cluster doesn't exist.
+
+    Raises:
+        ValueError: If both or neither of cluster_hash/cluster_name are
+            provided.
+    """
+    if cluster_hash is not None and cluster_name is not None:
+        raise ValueError(f'Cannot specify both cluster_hash ({cluster_hash}) '
+                         f'and cluster_name ({cluster_name})')
+
+    if cluster_hash is None and cluster_name is None:
+        raise ValueError('Must specify either cluster_hash or cluster_name')
+
+    if cluster_name is not None:
+        return _get_hash_for_existing_cluster(cluster_name)
+
+    return cluster_hash
+
+
 @_init_db
 @metrics_lib.time_me
 def get_launched_resources_from_cluster_hash(
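
The limit branch above relies on a small but handy SQL pattern: fetch the newest N rows with ORDER BY ... DESC LIMIT N, then re-sort that subquery ascending so callers still receive events oldest-to-newest. Below is a minimal standalone SQLAlchemy sketch of the same idea; the events table, its columns, and the helper name are invented for illustration and are not SkyPilot's actual cluster_event_table schema.

import sqlalchemy as sa

# Hypothetical table for illustration only; not SkyPilot's real schema.
metadata = sa.MetaData()
events = sa.Table(
    'events', metadata,
    sa.Column('id', sa.Integer, primary_key=True),
    sa.Column('reason', sa.Text),
    sa.Column('transitioned_at', sa.Integer),  # unix timestamp
)

engine = sa.create_engine('sqlite:///:memory:')
metadata.create_all(engine)


def most_recent_events_ascending(conn, limit: int):
    # Step 1: newest `limit` rows (ORDER BY DESC + LIMIT), as a subquery.
    newest = (sa.select(events)
              .order_by(events.c.transitioned_at.desc())
              .limit(limit)
              .subquery())
    # Step 2: re-order those rows oldest-to-newest for display.
    stmt = sa.select(newest).order_by(newest.c.transitioned_at.asc())
    return conn.execute(stmt).all()


with engine.begin() as conn:
    conn.execute(events.insert(), [
        {'reason': f'event {i}', 'transitioned_at': i} for i in range(10)
    ])
    rows = most_recent_events_ascending(conn, limit=3)
    print([r.reason for r in rows])  # ['event 7', 'event 8', 'event 9']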
sky/jobs/client/sdk.py CHANGED
@@ -7,6 +7,7 @@ import click
 
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
+from sky.backends import backend_utils
 from sky.client import common as client_common
 from sky.client import sdk
 from sky.schemas.api import responses
@@ -100,9 +101,13 @@ def launch(
             pool_statuses = sdk.get(pool_status_request_id)
             if not pool_statuses:
                 raise click.UsageError(f'Pool {pool!r} not found.')
-            resources = pool_statuses[0]['requested_resources_str']
-            click.secho(f'Use resources from pool {pool!r}: {resources}.',
-                        fg='green')
+            # Show the job's requested resources, not the pool worker
+            # resources
+            job_resources_str = backend_utils.get_task_resources_str(
+                dag.tasks[0], is_managed_job=True)
+            click.secho(
+                f'Use resources from pool {pool!r}: {job_resources_str}.',
+                fg='green')
         if num_jobs is not None:
             job_identity = f'{num_jobs} managed jobs'
             prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
sky/jobs/controller.py CHANGED
@@ -2,6 +2,7 @@
 """
 import asyncio
 import io
+import json
 import os
 import pathlib
 import resource
@@ -11,7 +12,7 @@ import threading
 import time
 import traceback
 import typing
-from typing import Dict, Optional, Set
+from typing import Dict, List, Optional, Set
 
 import dotenv
 
@@ -31,6 +32,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.server import plugins
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -43,11 +45,16 @@ from sky.utils import controller_utils
 from sky.utils import dag_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
+from sky.utils.plugin_extensions import ExternalClusterFailure
+from sky.utils.plugin_extensions import ExternalFailureSource
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger('sky.jobs.controller')
 
@@ -236,6 +243,64 @@ class JobController:
         await context_utils.to_thread(managed_job_utils.terminate_cluster,
                                       cluster_name)
 
+    async def _get_job_exit_codes(
+        self, job_id: Optional[int],
+        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+    ) -> Optional[list]:
+        """Retrieve exit codes from the remote cluster.
+
+        Args:
+            job_id: The job ID on the remote cluster.
+            handle: The handle to the cluster.
+
+        Returns:
+            List of exit codes, or None if not available.
+        """
+        try:
+            use_legacy = not handle.is_grpc_enabled_with_flag
+
+            if not use_legacy:
+                try:
+                    request = jobsv1_pb2.GetJobExitCodesRequest()
+                    if job_id is not None:
+                        request.job_id = job_id
+
+                    response = await context_utils.to_thread(
+                        backend_utils.invoke_skylet_with_retries,
+                        lambda: cloud_vm_ray_backend.SkyletClient(
+                            handle.get_grpc_channel()).get_job_exit_codes(
+                                request))
+
+                    exit_codes = list(
+                        response.exit_codes) if response.exit_codes else None
+                    return exit_codes
+                except exceptions.SkyletMethodNotImplementedError:
+                    # Fall back to legacy if RPC not implemented
+                    use_legacy = True
+
+            if use_legacy:
+                # Use existing SSH-based code generation
+                code = job_lib.JobLibCodeGen.get_job_exit_codes(job_id)
+
+                returncode, stdout, stderr = await context_utils.to_thread(
+                    self._backend.run_on_head,
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+
+                if returncode != 0:
+                    logger.debug(f'Failed to retrieve exit codes: {stderr}')
+                    return None
+
+                exit_codes = json.loads(stdout.strip())
+                return exit_codes
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to retrieve job exit codes: {e}')
+            return None
+        return None
+
     async def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.
 
@@ -334,6 +399,12 @@ class JobController:
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
 
+        # Get full_resources_json using get_resource_config which handles
+        # heterogeneous resource configurations (any_of/ordered).
+        full_resources_json = None
+        if task.resources:
+            full_resources_json = task.get_resource_config()
+
         await managed_job_state.set_starting_async(
             self._job_id,
             task_id,
@@ -342,9 +413,12 @@ class JobController:
             resources_str=resources_str,
             specs={
                 'max_restarts_on_errors':
-                    self._strategy_executor.max_restarts_on_errors
+                    self._strategy_executor.max_restarts_on_errors,
+                'recover_on_exit_codes':
+                    self._strategy_executor.recover_on_exit_codes
             },
-            callback_func=callback_func)
+            callback_func=callback_func,
+            full_resources_json=full_resources_json)
         logger.info(f'Submitted managed job {self._job_id} '
                     f'(task: {task_id}, name: {task.name!r}); '
                     f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
@@ -365,9 +439,8 @@ class JobController:
         launch_time = time.time() - launch_start
         logger.info(f'Cluster launch completed in {launch_time:.2f}s')
         assert remote_job_submitted_at is not None, remote_job_submitted_at
-        if self._pool is None:
-            job_id_on_pool_cluster = None
-        else:
+        job_id_on_pool_cluster: Optional[int] = None
+        if self._pool:
             # Update the cluster name when using pool.
             cluster_name, job_id_on_pool_cluster = (
                 await
@@ -411,6 +484,8 @@ class JobController:
         except KeyError:
             pass
 
+        transient_job_check_error_start_time = None
+        job_check_backoff = None
         while True:
            status_check_count += 1
 
@@ -462,19 +537,38 @@ class JobController:
            # recovering, we will set the job status to None, which will force
            # enter the recovering logic.
            job_status = None
+            transient_job_check_error_reason = None
            if not force_transit_to_recovering:
                try:
-                    job_status = await managed_job_utils.get_job_status(
-                        self._backend,
-                        cluster_name,
-                        job_id=job_id_on_pool_cluster,
-                    )
+                    job_status, transient_job_check_error_reason = await (
+                        managed_job_utils.get_job_status(
+                            self._backend,
+                            cluster_name,
+                            job_id=job_id_on_pool_cluster,
+                        ))
                except exceptions.FetchClusterInfoError as fetch_e:
                    logger.info(
                        'Failed to fetch the job status. Start recovery.\n'
                        f'Exception: {common_utils.format_exception(fetch_e)}\n'
                        f'Traceback: {traceback.format_exc()}')
 
+            # When job status check fails, we need to retry to avoid false alarm
+            # for job failure, as it could be a transient error for
+            # communication issue.
+            if transient_job_check_error_reason is not None:
+                logger.info(
+                    'Potential transient error when fetching the job '
+                    f'status. Reason: {transient_job_check_error_reason}.\n'
+                    'Check cluster status to determine if the job is '
+                    'preempted or failed.')
+                if transient_job_check_error_start_time is None:
+                    transient_job_check_error_start_time = time.time()
+                    job_check_backoff = common_utils.Backoff(
+                        initial_backoff=1, max_backoff_factor=5)
+            else:
+                transient_job_check_error_start_time = None
+                job_check_backoff = None
+
            if job_status == job_lib.JobStatus.SUCCEEDED:
                logger.info(f'Task {task_id} succeeded! '
                            'Getting end time and cleaning up')
@@ -550,15 +644,16 @@ class JobController:
 
            # Pull the actual cluster status from the cloud provider to
            # determine whether the cluster is preempted or failed.
-            # TODO(zhwu): For hardware failure, such as GPU failure, it may not
-            # be reflected in the cluster status, depending on the cloud, which
-            # can also cause failure of the job, and we need to recover it
-            # rather than fail immediately.
-            (cluster_status,
-             handle) = backend_utils.refresh_cluster_status_handle(
-                 cluster_name,
-                 force_refresh_statuses=set(status_lib.ClusterStatus))
-
+            # NOTE: Some failures may not be reflected in the cluster status
+            # depending on the cloud, which can also cause failure of the job.
+            # Plugins can report such failures via ExternalFailureSource.
+            # TODO(cooperc): do we need to add this to asyncio thread?
+            (cluster_status, handle) = await context_utils.to_thread(
+                backend_utils.refresh_cluster_status_handle,
+                cluster_name,
+                force_refresh_statuses=set(status_lib.ClusterStatus))
+
+            external_failures: Optional[List[ExternalClusterFailure]] = None
            if cluster_status != status_lib.ClusterStatus.UP:
                # The cluster is (partially) preempted or failed. It can be
                # down, INIT or STOPPED, based on the interruption behavior of
@@ -569,6 +664,15 @@ class JobController:
                logger.info(
                    f'Cluster is preempted or failed{cluster_status_str}. '
                    'Recovering...')
+                if ExternalFailureSource.is_registered():
+                    cluster_failures = await context_utils.to_thread(
+                        ExternalFailureSource.get, cluster_name=cluster_name)
+                    if cluster_failures:
+                        logger.info(
+                            f'Detected cluster failures: {cluster_failures}')
+                        external_failures = (
+                            ExternalClusterFailure.from_failure_list(
+                                cluster_failures))
            else:
                if job_status is not None and not job_status.is_terminal():
                    # The multi-node job is still running, continue monitoring.
@@ -612,18 +716,37 @@ class JobController:
                        'can be caused by the job taking too much memory '
                        'or other resources. Try adding more memory, CPU, '
                        f'or disk in your job definition. {failure_reason}')
+
+                    # Retrieve exit codes from the failed job
+                    exit_codes = await self._get_job_exit_codes(
+                        job_id_on_pool_cluster, handle)
+
                    should_restart_on_failure = (
-                        self._strategy_executor.should_restart_on_failure())
+                        self._strategy_executor.should_restart_on_failure(
+                            exit_codes=exit_codes))
                    if should_restart_on_failure:
                        max_restarts = (
                            self._strategy_executor.max_restarts_on_errors)
-                        logger.info(
-                            f'User program crashed '
-                            f'({managed_job_status.value}). '
-                            f'Retry the job as max_restarts_on_errors is '
-                            f'set to {max_restarts}. '
+                        exit_code_msg = (
+                            '(Retry the job as '
+                            f'max_restarts_on_errors is set to {max_restarts}. '
                            f'[{self._strategy_executor.restart_cnt_on_failure}'
-                            f'/{max_restarts}]')
+                            f'/{max_restarts}])')
+                        if (exit_codes and
+                                self._strategy_executor.recover_on_exit_codes):
+                            recover_codes = (
+                                self._strategy_executor.recover_on_exit_codes)
+                            matching_codes = [
+                                c for c in exit_codes if c in recover_codes
+                            ]
+                            if matching_codes:
+                                exit_code_msg = (
+                                    f'(Exit code(s) {matching_codes} matched '
+                                    'recover_on_exit_codes '
+                                    f'[{recover_codes}])')
+                        logger.info(
+                            'User program crashed '
+                            f'({managed_job_status.value}). {exit_code_msg}')
                    else:
                        logger.info(
                            f'Task {task_id} failed and will not be retried')
@@ -655,9 +778,42 @@ class JobController:
                # job status. Try to recover the job (will not restart the
                # cluster, if the cluster is healthy).
                assert job_status is None, job_status
-                logger.info('Failed to fetch the job status while the '
-                            'cluster is healthy. Try to recover the job '
-                            '(the cluster will not be restarted).')
+                if transient_job_check_error_reason is not None:
+                    assert (transient_job_check_error_start_time
+                            is not None), (
+                                transient_job_check_error_start_time,
+                                transient_job_check_error_reason)
+                    assert job_check_backoff is not None, (
+                        job_check_backoff, transient_job_check_error_reason)
+                    elapsed = time.time(
+                    ) - transient_job_check_error_start_time
+                    if (elapsed < managed_job_utils.
+                            JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS):
+                        remaining_timeout = (
+                            managed_job_utils.
+                            JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS -
+                            elapsed)
+                        backoff_time = min(
+                            job_check_backoff.current_backoff(),
+                            remaining_timeout)
+                        logger.info(
+                            'Failed to fetch the job status while the '
+                            'cluster is healthy. Retrying to avoid false'
+                            'alarm for job failure. Retrying in '
+                            f'{backoff_time:.1f} seconds...')
+                        await asyncio.sleep(backoff_time)
+                        continue
+                    else:
+                        logger.info(
+                            'Failed to fetch the job status after retrying '
+                            f'for {elapsed:.1f} seconds. Try to recover '
+                            'the job by restarting the job/cluster.')
+                else:
+                    logger.info(
+                        'Failed to fetch the job status due to '
+                        'unrecoverable error. Try to recover the job by'
+                        ' restarting the job/cluster.')
+
            # When the handle is None, the cluster should be cleaned up already.
            if handle is not None:
                resources = handle.launched_resources
@@ -688,7 +844,9 @@ class JobController:
                job_id=self._job_id,
                task_id=task_id,
                force_transit_to_recovering=force_transit_to_recovering,
-                callback_func=callback_func)
+                callback_func=callback_func,
+                external_failures=external_failures,
+            )
 
            recovered_time = await self._strategy_executor.recover()
 
@@ -1183,6 +1341,8 @@ async def main(controller_uuid: str):
 
     context_utils.hijack_sys_attrs()
 
+    plugins.load_plugins(plugins.ExtensionContext())
+
     controller = ControllerManager(controller_uuid)
 
     # Will happen multiple times, who cares though
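
The controller change above stops treating a single failed status probe as a job failure: it records when transient errors began, retries with common_utils.Backoff, and only falls through to recovery once managed_job_utils.JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS has elapsed. Below is a standalone sketch of that retry-with-deadline idea; the _Backoff class, the fetch_with_deadline helper, and the 60-second default are assumptions made for illustration and are not SkyPilot's implementation, which runs inside the async monitoring loop shown in the diff.

import random
import time
from typing import Callable, Optional


class _Backoff:
    """Tiny exponential-backoff helper (illustrative, not sky's Backoff)."""

    def __init__(self, initial_backoff: float = 1.0,
                 max_backoff_factor: int = 5):
        self._initial = initial_backoff
        self._cap = initial_backoff * (2 ** max_backoff_factor)
        self._attempts = 0

    def current_backoff(self) -> float:
        backoff = min(self._initial * (2 ** self._attempts), self._cap)
        self._attempts += 1
        # Jitter so many controllers do not retry in lockstep.
        return backoff * random.uniform(0.8, 1.2)


def fetch_with_deadline(fetch: Callable[[], Optional[str]],
                        total_timeout: float = 60.0) -> Optional[str]:
    """Retry a flaky status probe until a total deadline expires.

    A None result is treated as a transient failure and retried; only after
    `total_timeout` seconds of consecutive failures do we give up, mirroring
    the controller's fall-through to recovery.
    """
    start = time.time()
    backoff = _Backoff(initial_backoff=1, max_backoff_factor=5)
    while True:
        status = fetch()
        if status is not None:
            return status
        elapsed = time.time() - start
        if elapsed >= total_timeout:
            return None  # Caller treats this as "start recovery".
        time.sleep(min(backoff.current_backoff(), total_timeout - elapsed))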