skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -89,6 +89,7 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
89
89
  resumed_instance_ids=[],
90
90
  created_instance_ids=[])
91
91
 
92
+ secure_only = config.provider_config.get('secure_only', False)
92
93
  for _ in range(to_start_count):
93
94
  node_type = 'head' if head_instance_id is None else 'worker'
94
95
  try:
@@ -99,7 +100,9 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
99
100
  disk_size=config.node_config['DiskSize'],
100
101
  preemptible=config.node_config['Preemptible'],
101
102
  image_name=config.node_config['ImageId'],
102
- ports=config.ports_to_open_on_launch)
103
+ ports=config.ports_to_open_on_launch,
104
+ secure_only=secure_only,
105
+ )
103
106
  except Exception as e: # pylint: disable=broad-except
104
107
  logger.warning(f'run_instances error: {e}')
105
108
  raise
@@ -34,8 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
34
34
 
35
35
 
36
36
  def launch(name: str, instance_type: str, region: str, disk_size: int,
37
- image_name: str, ports: Optional[List[int]],
38
- preemptible: bool) -> str:
37
+ image_name: str, ports: Optional[List[int]], preemptible: bool,
38
+ secure_only: bool) -> str:
39
39
  """Launches an instance with the given parameters.
40
40
 
41
41
  Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -87,7 +87,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
87
87
  gpu_name = instance_type.split('-')[1].replace('_', ' ')
88
88
  num_gpus = int(instance_type.split('-')[0].replace('x', ''))
89
89
 
90
- query = ' '.join([
90
+ query = [
91
91
  'chunked=true',
92
92
  'georegion=true',
93
93
  f'geolocation="{region[-2:]}"',
@@ -95,13 +95,18 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
95
95
  f'num_gpus={num_gpus}',
96
96
  f'gpu_name="{gpu_name}"',
97
97
  f'cpu_ram>="{cpu_ram}"',
98
- ])
98
+ ]
99
+ if secure_only:
100
+ query.append('datacenter=true')
101
+ query.append('hosting_type>=1')
102
+ query_str = ' '.join(query)
99
103
 
100
- instance_list = vast.vast().search_offers(query=query)
104
+ instance_list = vast.vast().search_offers(query=query_str)
101
105
 
102
106
  if isinstance(instance_list, int) or len(instance_list) == 0:
103
107
  raise RuntimeError('Failed to create instances, could not find an '
104
- f'offer that satisfies the requirements "{query}".')
108
+ 'offer that satisfies the requirements '
109
+ f'"{query_str}".')
105
110
 
106
111
  instance_touse = instance_list[0]
107
112
 
sky/resources.py CHANGED
@@ -219,6 +219,9 @@ class Resources:
219
219
  - strategy: the recovery strategy to use.
220
220
  - max_restarts_on_errors: the max number of restarts on user code
221
221
  errors.
222
+ - recover_on_exit_codes: a list of exit codes that should trigger
223
+ job recovery. If any task exits with a code in this list, the job
224
+ will be recovered regardless of max_restarts_on_errors limit.
222
225
 
223
226
  region: the region to use. Deprecated. Use `infra` instead.
224
227
  zone: the zone to use. Deprecated. Use `infra` instead.
@@ -569,7 +572,8 @@ class Resources:
569
572
  if self.cloud is not None and self._instance_type is not None:
570
573
  vcpus, _ = self.cloud.get_vcpus_mem_from_instance_type(
571
574
  self._instance_type)
572
- return str(vcpus)
575
+ if vcpus is not None:
576
+ return str(vcpus)
573
577
  return None
574
578
 
575
579
  @property
@@ -1645,6 +1649,7 @@ class Resources:
1645
1649
  other: Union[List['Resources'], 'Resources'],
1646
1650
  requested_num_nodes: int = 1,
1647
1651
  check_ports: bool = False,
1652
+ check_cloud: bool = True,
1648
1653
  ) -> bool:
1649
1654
  """Returns whether this resources is less demanding than the other.
1650
1655
 
@@ -1654,24 +1659,29 @@ class Resources:
1654
1659
  requested_num_nodes: Number of nodes that the current task
1655
1660
  requests from the cluster.
1656
1661
  check_ports: Whether to check the ports field.
1662
+ check_cloud: Whether we check the cloud/region/zone fields. Useful
1663
+ for resources that don't have cloud specified, like some launched
1664
+ resources.
1657
1665
  """
1658
1666
  if isinstance(other, list):
1659
1667
  resources_list = [self.less_demanding_than(o) for o in other]
1660
1668
  return requested_num_nodes <= sum(resources_list)
1661
1669
 
1662
- assert other.cloud is not None, 'Other cloud must be specified'
1670
+ if check_cloud:
1671
+ assert other.cloud is not None, 'Other cloud must be specified'
1663
1672
 
1664
- if self.cloud is not None and not self.cloud.is_same_cloud(other.cloud):
1665
- return False
1666
- # self.cloud <= other.cloud
1673
+ if self.cloud is not None and not self.cloud.is_same_cloud(
1674
+ other.cloud):
1675
+ return False
1676
+ # self.cloud <= other.cloud
1667
1677
 
1668
- if self.region is not None and self.region != other.region:
1669
- return False
1670
- # self.region <= other.region
1678
+ if self.region is not None and self.region != other.region:
1679
+ return False
1680
+ # self.region <= other.region
1671
1681
 
1672
- if self.zone is not None and self.zone != other.zone:
1673
- return False
1674
- # self.zone <= other.zone
1682
+ if self.zone is not None and self.zone != other.zone:
1683
+ return False
1684
+ # self.zone <= other.zone
1675
1685
 
1676
1686
  if self.image_id is not None:
1677
1687
  if other.image_id is None:
@@ -1743,8 +1753,10 @@ class Resources:
1743
1753
  # On Kubernetes, we can't launch a task that requires FUSE on a pod
1744
1754
  # that wasn't initialized with FUSE support at the start.
1745
1755
  # Other clouds don't have this limitation.
1746
- if other.cloud.is_same_cloud(clouds.Kubernetes()):
1747
- return False
1756
+ if check_cloud:
1757
+ assert other.cloud is not None
1758
+ if other.cloud.is_same_cloud(clouds.Kubernetes()):
1759
+ return False
1748
1760
 
1749
1761
  # self <= other
1750
1762
  return True
@@ -1792,6 +1804,101 @@ class Resources:
1792
1804
  self._docker_login_config is None,
1793
1805
  ])
1794
1806
 
1807
+ def __add__(self, other: Optional['Resources']) -> Optional['Resources']:
1808
+ """Add two Resources objects together.
1809
+
1810
+ Args:
1811
+ other: Another Resources object to add (may be None)
1812
+
1813
+ Returns:
1814
+ New Resources object with summed resources, or None if other is None
1815
+ """
1816
+ if other is None:
1817
+ return self
1818
+
1819
+ # Sum CPUs
1820
+ self_cpus = _parse_value(self.cpus)
1821
+ other_cpus = _parse_value(other.cpus)
1822
+ total_cpus = None
1823
+ if self_cpus is not None or other_cpus is not None:
1824
+ total_cpus = (self_cpus or 0) + (other_cpus or 0)
1825
+
1826
+ # Sum memory
1827
+ self_memory = _parse_value(self.memory)
1828
+ other_memory = _parse_value(other.memory)
1829
+ total_memory = None
1830
+ if self_memory is not None or other_memory is not None:
1831
+ total_memory = (self_memory or 0) + (other_memory or 0)
1832
+
1833
+ # Sum accelerators
1834
+ total_accelerators = {}
1835
+ if self.accelerators:
1836
+ for acc_type, count in self.accelerators.items():
1837
+ total_accelerators[acc_type] = float(count)
1838
+ if other.accelerators:
1839
+ for acc_type, count in other.accelerators.items():
1840
+ if acc_type not in total_accelerators:
1841
+ total_accelerators[acc_type] = 0
1842
+ total_accelerators[acc_type] += float(count)
1843
+
1844
+ return Resources(
1845
+ cpus=str(total_cpus) if total_cpus is not None else None,
1846
+ memory=str(total_memory) if total_memory is not None else None,
1847
+ accelerators=total_accelerators if total_accelerators else None)
1848
+
1849
+ def __sub__(self, other: Optional['Resources']) -> 'Resources':
1850
+ """Subtract another Resources object from this one.
1851
+
1852
+ Args:
1853
+ other: Resources to subtract (may be None)
1854
+
1855
+ Returns:
1856
+ New Resources object with subtracted resources. If the result for a
1857
+ resource is negative, it will be set to 0.
1858
+ """
1859
+ if other is None:
1860
+ return self
1861
+
1862
+ # Subtract CPUs
1863
+ self_cpus = _parse_value(self.cpus)
1864
+ other_cpus = _parse_value(other.cpus)
1865
+ free_cpus = None
1866
+ if self_cpus is not None:
1867
+ if other_cpus is not None:
1868
+ free_cpus = max(0, self_cpus - other_cpus)
1869
+ else:
1870
+ free_cpus = self_cpus
1871
+
1872
+ # Subtract memory
1873
+ self_memory = _parse_value(self.memory)
1874
+ other_memory = _parse_value(other.memory)
1875
+ free_memory = None
1876
+ if self_memory is not None:
1877
+ if other_memory is not None:
1878
+ free_memory = max(0, self_memory - other_memory)
1879
+ else:
1880
+ free_memory = self_memory
1881
+
1882
+ # Subtract accelerators
1883
+ free_accelerators = {}
1884
+ if self.accelerators:
1885
+ for acc_type, total_count in self.accelerators.items():
1886
+ used_count = (other.accelerators.get(acc_type, 0)
1887
+ if other.accelerators else 0)
1888
+ free_count = max(0, float(total_count) - float(used_count))
1889
+ if free_count > 0:
1890
+ free_accelerators[acc_type] = free_count
1891
+
1892
+ # If all resources are exhausted, return None
1893
+ # Check if we have any free resources
1894
+ free_cpus = None if free_cpus == 0 else free_cpus
1895
+ free_memory = None if free_memory == 0 else free_memory
1896
+ free_accelerators = None if not free_accelerators else free_accelerators
1897
+
1898
+ return Resources(cpus=free_cpus,
1899
+ memory=free_memory,
1900
+ accelerators=free_accelerators)
1901
+
1795
1902
  def copy(self, **override) -> 'Resources':
1796
1903
  """Returns a copy of the given Resources."""
1797
1904
  use_spot = self.use_spot if self._use_spot_specified else None
@@ -2456,3 +2563,18 @@ def _maybe_add_docker_prefix_to_image_id(
2456
2563
  for k, v in image_id_dict.items():
2457
2564
  if not v.startswith('docker:'):
2458
2565
  image_id_dict[k] = f'docker:{v}'
2566
+
2567
+
2568
+ def _parse_value(val):
2569
+ if val is None:
2570
+ return None
2571
+ if isinstance(val, (int, float)):
2572
+ return float(val)
2573
+ if isinstance(val, str):
2574
+ # Remove '+' suffix if present
2575
+ val = val.rstrip('+')
2576
+ try:
2577
+ return float(val)
2578
+ except ValueError:
2579
+ return None
2580
+ return None
@@ -123,6 +123,7 @@ class StatusResponse(ResponseBaseModel):
123
123
  cpus: Optional[str] = None
124
124
  memory: Optional[str] = None
125
125
  accelerators: Optional[str] = None
126
+ labels: Optional[Dict[str, str]] = None
126
127
  cluster_name_on_cloud: Optional[str] = None
127
128
 
128
129
 
@@ -203,6 +204,8 @@ class ManagedJobRecord(ResponseBaseModel):
203
204
  current_cluster_name: Optional[str] = None
204
205
  job_id_on_pool_cluster: Optional[int] = None
205
206
  accelerators: Optional[Dict[str, int]] = None
207
+ labels: Optional[Dict[str, str]] = None
208
+ links: Optional[Dict[str, str]] = None
206
209
 
207
210
 
208
211
  class VolumeRecord(ResponseBaseModel):
@@ -225,3 +228,4 @@ class VolumeRecord(ResponseBaseModel):
225
228
  usedby_pods: List[str]
226
229
  usedby_clusters: List[str]
227
230
  is_ephemeral: bool = False
231
+ usedby_fetch_failed: bool = False
@@ -21,7 +21,7 @@ depends_on: Union[str, Sequence[str], None] = None
21
21
 
22
22
 
23
23
  def upgrade():
24
- """Add last_activity_time and launched_at columns to cluster history."""
24
+ """Add ssh keys if it was not already added to global user state."""
25
25
  connection = op.get_bind()
26
26
 
27
27
  match_dirs = glob.glob(os.path.expanduser('~/.sky/clients/*/ssh'))
@@ -0,0 +1,34 @@
1
+ """Add full_resources column to spot table.
2
+
3
+ Revision ID: 008
4
+ Revises: 007
5
+ Create Date: 2025-12-03
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+ import sqlalchemy as sa
13
+
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '008'
18
+ down_revision: Union[str, Sequence[str], None] = '007'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade():
24
+ """Add full_resources column to spot table."""
25
+ with op.get_context().autocommit_block():
26
+ db_utils.add_column_to_table_alembic('spot',
27
+ 'full_resources',
28
+ sa.JSON(),
29
+ server_default=None)
30
+
31
+
32
+ def downgrade():
33
+ """No downgrade logic."""
34
+ pass
@@ -0,0 +1,32 @@
1
+ """Add job_events table for tracking managed job events.
2
+
3
+ Revision ID: 009
4
+ Revises: 008
5
+ Create Date: 2025-12-11
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+
13
+ from sky.jobs.state import Base
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '009'
18
+ down_revision: Union[str, Sequence[str], None] = '008'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade():
24
+ """Create job_events table for tracking job events."""
25
+ with op.get_context().autocommit_block():
26
+ db_utils.add_table_to_db_sqlalchemy(Base.metadata, op.get_bind(),
27
+ 'job_events')
28
+
29
+
30
+ def downgrade():
31
+ """Drop job_events table."""
32
+ pass
@@ -0,0 +1,43 @@
1
+ """Change job_events timestamp column to support timezone.
2
+
3
+ Revision ID: 010
4
+ Revises: 009
5
+ Create Date: 2025-12-22
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+ import sqlalchemy as sa
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision: str = '010'
16
+ down_revision: Union[str, Sequence[str], None] = '009'
17
+ branch_labels: Union[str, Sequence[str], None] = None
18
+ depends_on: Union[str, Sequence[str], None] = None
19
+
20
+
21
+ def upgrade():
22
+ """Change timestamp column to TIMESTAMP WITH TIME ZONE.
23
+
24
+ This only affects PostgreSQL - SQLite stores datetimes as text and handles
25
+ timezone-aware datetimes automatically.
26
+ """
27
+ bind = op.get_bind()
28
+
29
+ if bind.dialect.name == 'postgresql':
30
+ # For PostgreSQL, change TIMESTAMP to TIMESTAMPTZ
31
+ # The USING clause converts existing naive timestamps to UTC
32
+ with op.get_context().autocommit_block():
33
+ op.alter_column('job_events',
34
+ 'timestamp',
35
+ type_=sa.DateTime(timezone=True),
36
+ existing_type=sa.DateTime(timezone=False),
37
+ postgresql_using='timestamp AT TIME ZONE \'UTC\'')
38
+ # SQLite: no migration needed, timezone support is handled by SQLAlchemy
39
+
40
+
41
+ def downgrade():
42
+ """No downgrade logic."""
43
+ pass
@@ -0,0 +1,34 @@
1
+ """Add links column for storing cluster instance links.
2
+
3
+ Revision ID: 011
4
+ Revises: 010
5
+ Create Date: 2026-01-07
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+ import sqlalchemy as sa
13
+
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '011'
18
+ down_revision: Union[str, Sequence[str], None] = '010'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade():
24
+ """Add links column to store instance links as JSON."""
25
+ with op.get_context().autocommit_block():
26
+ db_utils.add_column_to_table_alembic('spot',
27
+ 'links',
28
+ sa.JSON(),
29
+ server_default=None)
30
+
31
+
32
+ def downgrade():
33
+ """No downgrade logic."""
34
+ pass
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
14
14
 
15
15
 
16
16
 
17
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\xab\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTask\x12\x14\n\x07user_id\x18\x06 \x01(\tH\x01\x88\x01\x01\x42\x07\n\x05_poolB\n\n\x08_user_id\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\x91\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponseb\x06proto3')
17
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\xab\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTask\x12\x14\n\x07user_id\x18\x06 \x01(\tH\x01\x88\x01\x01\x42\x07\n\x05_poolB\n\n\x08_user_id\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"8\n\x16GetJobExitCodesRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"-\n\x17GetJobExitCodesResponse\x12\x12\n\nexit_codes\x18\x01 \x03(\x05*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\xe7\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponse\x12T\n\x0fGetJobExitCodes\x12\x1f.jobs.v1.GetJobExitCodesRequest\x1a .jobs.v1.GetJobExitCodesResponseb\x06proto3')
18
18
 
19
19
  _globals = globals()
20
20
  _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
25
25
  _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
26
26
  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
27
27
  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
28
- _globals['_JOBSTATUS']._serialized_start=2219
29
- _globals['_JOBSTATUS']._serialized_end=2488
28
+ _globals['_JOBSTATUS']._serialized_start=2324
29
+ _globals['_JOBSTATUS']._serialized_end=2593
30
30
  _globals['_ADDJOBREQUEST']._serialized_start=48
31
31
  _globals['_ADDJOBREQUEST']._serialized_end=181
32
32
  _globals['_ADDJOBRESPONSE']._serialized_start=183
@@ -81,6 +81,10 @@ if not _descriptor._USE_C_DESCRIPTORS:
81
81
  _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2216
82
82
  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2167
83
83
  _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2216
84
- _globals['_JOBSSERVICE']._serialized_start=2491
85
- _globals['_JOBSSERVICE']._serialized_end=3404
84
+ _globals['_GETJOBEXITCODESREQUEST']._serialized_start=2218
85
+ _globals['_GETJOBEXITCODESREQUEST']._serialized_end=2274
86
+ _globals['_GETJOBEXITCODESRESPONSE']._serialized_start=2276
87
+ _globals['_GETJOBEXITCODESRESPONSE']._serialized_end=2321
88
+ _globals['_JOBSSERVICE']._serialized_start=2596
89
+ _globals['_JOBSSERVICE']._serialized_end=3595
86
90
  # @@protoc_insertion_point(module_scope)
@@ -252,3 +252,15 @@ class GetLogDirsForJobsResponse(_message.Message):
252
252
  JOB_LOG_DIRS_FIELD_NUMBER: _ClassVar[int]
253
253
  job_log_dirs: _containers.ScalarMap[int, str]
254
254
  def __init__(self, job_log_dirs: _Optional[_Mapping[int, str]] = ...) -> None: ...
255
+
256
+ class GetJobExitCodesRequest(_message.Message):
257
+ __slots__ = ("job_id",)
258
+ JOB_ID_FIELD_NUMBER: _ClassVar[int]
259
+ job_id: int
260
+ def __init__(self, job_id: _Optional[int] = ...) -> None: ...
261
+
262
+ class GetJobExitCodesResponse(_message.Message):
263
+ __slots__ = ("exit_codes",)
264
+ EXIT_CODES_FIELD_NUMBER: _ClassVar[int]
265
+ exit_codes: _containers.RepeatedScalarFieldContainer[int]
266
+ def __init__(self, exit_codes: _Optional[_Iterable[int]] = ...) -> None: ...
@@ -94,6 +94,11 @@ class JobsServiceStub(object):
94
94
  request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsRequest.SerializeToString,
95
95
  response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsResponse.FromString,
96
96
  _registered_method=True)
97
+ self.GetJobExitCodes = channel.unary_unary(
98
+ '/jobs.v1.JobsService/GetJobExitCodes',
99
+ request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesRequest.SerializeToString,
100
+ response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesResponse.FromString,
101
+ _registered_method=True)
97
102
 
98
103
 
99
104
  class JobsServiceServicer(object):
@@ -176,6 +181,13 @@ class JobsServiceServicer(object):
176
181
  context.set_details('Method not implemented!')
177
182
  raise NotImplementedError('Method not implemented!')
178
183
 
184
+ def GetJobExitCodes(self, request, context):
185
+ """Get job exit codes.
186
+ """
187
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
188
+ context.set_details('Method not implemented!')
189
+ raise NotImplementedError('Method not implemented!')
190
+
179
191
 
180
192
  def add_JobsServiceServicer_to_server(servicer, server):
181
193
  rpc_method_handlers = {
@@ -234,6 +246,11 @@ def add_JobsServiceServicer_to_server(servicer, server):
234
246
  request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsRequest.FromString,
235
247
  response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsResponse.SerializeToString,
236
248
  ),
249
+ 'GetJobExitCodes': grpc.unary_unary_rpc_method_handler(
250
+ servicer.GetJobExitCodes,
251
+ request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesRequest.FromString,
252
+ response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesResponse.SerializeToString,
253
+ ),
237
254
  }
238
255
  generic_handler = grpc.method_handlers_generic_handler(
239
256
  'jobs.v1.JobsService', rpc_method_handlers)
@@ -540,3 +557,30 @@ class JobsService(object):
540
557
  timeout,
541
558
  metadata,
542
559
  _registered_method=True)
560
+
561
+ @staticmethod
562
+ def GetJobExitCodes(request,
563
+ target,
564
+ options=(),
565
+ channel_credentials=None,
566
+ call_credentials=None,
567
+ insecure=False,
568
+ compression=None,
569
+ wait_for_ready=None,
570
+ timeout=None,
571
+ metadata=None):
572
+ return grpc.experimental.unary_unary(
573
+ request,
574
+ target,
575
+ '/jobs.v1.JobsService/GetJobExitCodes',
576
+ sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesRequest.SerializeToString,
577
+ sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesResponse.FromString,
578
+ options,
579
+ channel_credentials,
580
+ insecure,
581
+ call_credentials,
582
+ compression,
583
+ wait_for_ready,
584
+ timeout,
585
+ metadata,
586
+ _registered_method=True)