skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/provision/vast/instance.py
CHANGED

@@ -89,6 +89,7 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                                 resumed_instance_ids=[],
                                 created_instance_ids=[])
 
+    secure_only = config.provider_config.get('secure_only', False)
     for _ in range(to_start_count):
         node_type = 'head' if head_instance_id is None else 'worker'
         try:
@@ -99,7 +100,9 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                 disk_size=config.node_config['DiskSize'],
                 preemptible=config.node_config['Preemptible'],
                 image_name=config.node_config['ImageId'],
-                ports=config.ports_to_open_on_launch)
+                ports=config.ports_to_open_on_launch,
+                secure_only=secure_only,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')
             raise
sky/provision/vast/utils.py
CHANGED

@@ -34,8 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
 
 
 def launch(name: str, instance_type: str, region: str, disk_size: int,
-           image_name: str, ports: Optional[List[int]],
-           preemptible: bool) -> str:
+           image_name: str, ports: Optional[List[int]], preemptible: bool,
+           secure_only: bool) -> str:
     """Launches an instance with the given parameters.
 
     Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -87,7 +87,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
     gpu_name = instance_type.split('-')[1].replace('_', ' ')
     num_gpus = int(instance_type.split('-')[0].replace('x', ''))
 
-    query = ' '.join([
+    query = [
         'chunked=true',
         'georegion=true',
         f'geolocation="{region[-2:]}"',
@@ -95,13 +95,18 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
         f'num_gpus={num_gpus}',
         f'gpu_name="{gpu_name}"',
         f'cpu_ram>="{cpu_ram}"',
-    ])
+    ]
+    if secure_only:
+        query.append('datacenter=true')
+        query.append('hosting_type>=1')
+    query_str = ' '.join(query)
 
-    instance_list = vast.vast().search_offers(query=query)
+    instance_list = vast.vast().search_offers(query=query_str)
 
     if isinstance(instance_list, int) or len(instance_list) == 0:
         raise RuntimeError('Failed to create instances, could not find an '
-                           f'offer that satisfies the requirements "{query}".')
+                           'offer that satisfies the requirements '
+                           f'"{query_str}".')
 
     instance_touse = instance_list[0]
 
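When `secure_only` is set, two extra filters are appended before the list is joined into the space-separated query string that Vast's offer search expects. A rough illustration with made-up values (a hypothetical `1x-RTX_4090` request in a region ending in `US`; `cpu_ram` normally comes from the instance specs):

# Illustrative only: the query launch() above would assemble with
# secure_only=True. All concrete values here are hypothetical.
query = [
    'chunked=true',
    'georegion=true',
    'geolocation="US"',
    'num_gpus=1',
    'gpu_name="RTX 4090"',
    'cpu_ram>="32"',
]
query.append('datacenter=true')   # appended only when secure_only is set
query.append('hosting_type>=1')   # appended only when secure_only is set
query_str = ' '.join(query)
# -> 'chunked=true georegion=true ... datacenter=true hosting_type>=1'
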
sky/resources.py
CHANGED

@@ -219,6 +219,9 @@ class Resources:
           - strategy: the recovery strategy to use.
           - max_restarts_on_errors: the max number of restarts on user code
             errors.
+          - recover_on_exit_codes: a list of exit codes that should trigger
+            job recovery. If any task exits with a code in this list, the job
+            will be recovered regardless of max_restarts_on_errors limit.
 
         region: the region to use. Deprecated. Use `infra` instead.
         zone: the zone to use. Deprecated. Use `infra` instead.
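These bullets document sub-fields of the `job_recovery` argument to `Resources`. A minimal sketch of how the new key would be set, assuming it is passed through `job_recovery` alongside its documented siblings (the exit code value is arbitrary):

import sky

# Hedged sketch: recover_on_exit_codes next to the existing job_recovery
# sub-fields documented above; 101 is an arbitrary example exit code.
resources = sky.Resources(
    accelerators='A100:1',
    job_recovery={
        'strategy': 'FAILOVER',
        'max_restarts_on_errors': 3,
        'recover_on_exit_codes': [101],
    })
task = sky.Task(run='python train.py').set_resources(resources)
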
@@ -569,7 +572,8 @@
         if self.cloud is not None and self._instance_type is not None:
             vcpus, _ = self.cloud.get_vcpus_mem_from_instance_type(
                 self._instance_type)
-            return str(vcpus)
+            if vcpus is not None:
+                return str(vcpus)
         return None
 
     @property
@@ -1645,6 +1649,7 @@
         other: Union[List['Resources'], 'Resources'],
         requested_num_nodes: int = 1,
         check_ports: bool = False,
+        check_cloud: bool = True,
     ) -> bool:
         """Returns whether this resources is less demanding than the other.
 
@@ -1654,24 +1659,29 @@
             requested_num_nodes: Number of nodes that the current task
                 requests from the cluster.
             check_ports: Whether to check the ports field.
+            check_cloud: Whether we check the cloud/region/zone fields. Useful
+                for resources that don't have cloud specified, like some launched
+                resources.
         """
         if isinstance(other, list):
             resources_list = [self.less_demanding_than(o) for o in other]
             return requested_num_nodes <= sum(resources_list)
 
-        assert other.cloud is not None, 'Other cloud must be specified'
+        if check_cloud:
+            assert other.cloud is not None, 'Other cloud must be specified'
 
-        if self.cloud is not None and not self.cloud.is_same_cloud(
-                other.cloud):
-            return False
+            if self.cloud is not None and not self.cloud.is_same_cloud(
+                    other.cloud):
+                return False
+            # self.cloud <= other.cloud
 
-        if self.region is not None and self.region != other.region:
-            return False
-        # self.region <= other.region
+            if self.region is not None and self.region != other.region:
+                return False
+            # self.region <= other.region
 
-        if self.zone is not None and self.zone != other.zone:
-            return False
-        # self.zone <= other.zone
+            if self.zone is not None and self.zone != other.zone:
+                return False
+            # self.zone <= other.zone
 
         if self.image_id is not None:
             if other.image_id is None:
@@ -1743,8 +1753,10 @@
         # On Kubernetes, we can't launch a task that requires FUSE on a pod
         # that wasn't initialized with FUSE support at the start.
         # Other clouds don't have this limitation.
-        if other.cloud.is_same_cloud(clouds.Kubernetes()):
-            return False
+        if check_cloud:
+            assert other.cloud is not None
+            if other.cloud.is_same_cloud(clouds.Kubernetes()):
+                return False
 
         # self <= other
         return True
@@ -1792,6 +1804,101 @@
             self._docker_login_config is None,
         ])
 
+    def __add__(self, other: Optional['Resources']) -> Optional['Resources']:
+        """Add two Resources objects together.
+
+        Args:
+            other: Another Resources object to add (may be None)
+
+        Returns:
+            New Resources object with summed resources, or None if other is None
+        """
+        if other is None:
+            return self
+
+        # Sum CPUs
+        self_cpus = _parse_value(self.cpus)
+        other_cpus = _parse_value(other.cpus)
+        total_cpus = None
+        if self_cpus is not None or other_cpus is not None:
+            total_cpus = (self_cpus or 0) + (other_cpus or 0)
+
+        # Sum memory
+        self_memory = _parse_value(self.memory)
+        other_memory = _parse_value(other.memory)
+        total_memory = None
+        if self_memory is not None or other_memory is not None:
+            total_memory = (self_memory or 0) + (other_memory or 0)
+
+        # Sum accelerators
+        total_accelerators = {}
+        if self.accelerators:
+            for acc_type, count in self.accelerators.items():
+                total_accelerators[acc_type] = float(count)
+        if other.accelerators:
+            for acc_type, count in other.accelerators.items():
+                if acc_type not in total_accelerators:
+                    total_accelerators[acc_type] = 0
+                total_accelerators[acc_type] += float(count)
+
+        return Resources(
+            cpus=str(total_cpus) if total_cpus is not None else None,
+            memory=str(total_memory) if total_memory is not None else None,
+            accelerators=total_accelerators if total_accelerators else None)
+
+    def __sub__(self, other: Optional['Resources']) -> 'Resources':
+        """Subtract another Resources object from this one.
+
+        Args:
+            other: Resources to subtract (may be None)
+
+        Returns:
+            New Resources object with subtracted resources. If the result for a
+            resource is negative, it will be set to 0.
+        """
+        if other is None:
+            return self
+
+        # Subtract CPUs
+        self_cpus = _parse_value(self.cpus)
+        other_cpus = _parse_value(other.cpus)
+        free_cpus = None
+        if self_cpus is not None:
+            if other_cpus is not None:
+                free_cpus = max(0, self_cpus - other_cpus)
+            else:
+                free_cpus = self_cpus
+
+        # Subtract memory
+        self_memory = _parse_value(self.memory)
+        other_memory = _parse_value(other.memory)
+        free_memory = None
+        if self_memory is not None:
+            if other_memory is not None:
+                free_memory = max(0, self_memory - other_memory)
+            else:
+                free_memory = self_memory
+
+        # Subtract accelerators
+        free_accelerators = {}
+        if self.accelerators:
+            for acc_type, total_count in self.accelerators.items():
+                used_count = (other.accelerators.get(acc_type, 0)
+                              if other.accelerators else 0)
+                free_count = max(0, float(total_count) - float(used_count))
+                if free_count > 0:
+                    free_accelerators[acc_type] = free_count
+
+        # If all resources are exhausted, return None
+        # Check if we have any free resources
+        free_cpus = None if free_cpus == 0 else free_cpus
+        free_memory = None if free_memory == 0 else free_memory
+        free_accelerators = None if not free_accelerators else free_accelerators
+
+        return Resources(cpus=free_cpus,
+                         memory=free_memory,
+                         accelerators=free_accelerators)
+
     def copy(self, **override) -> 'Resources':
         """Returns a copy of the given Resources."""
         use_spot = self.use_spot if self._use_spot_specified else None
@@ -2456,3 +2563,18 @@ def _maybe_add_docker_prefix_to_image_id(
     for k, v in image_id_dict.items():
         if not v.startswith('docker:'):
             image_id_dict[k] = f'docker:{v}'
+
+
+def _parse_value(val):
+    if val is None:
+        return None
+    if isinstance(val, (int, float)):
+        return float(val)
+    if isinstance(val, str):
+        # Remove '+' suffix if present
+        val = val.rstrip('+')
+        try:
+            return float(val)
+        except ValueError:
+            return None
+    return None
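Taken together, `__add__`, `__sub__`, and `_parse_value` give `Resources` simple capacity arithmetic: missing fields count as zero, `'2+'`-style strings parse as plain numbers, and subtraction clamps at zero and drops exhausted fields. A quick sketch with hypothetical values:

import sky

total = sky.Resources(cpus='8', memory='32', accelerators={'A100': 4})
used = sky.Resources(cpus='2+', memory='16', accelerators={'A100': 4})

free = total - used      # ~6 CPUs and ~16 GB left; all A100s are in use,
                         # so the accelerators field comes back as None
combined = total + used  # ~10 CPUs, ~48 GB, {'A100': 8.0}
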
sky/schemas/api/responses.py
CHANGED

@@ -123,6 +123,7 @@ class StatusResponse(ResponseBaseModel):
     cpus: Optional[str] = None
     memory: Optional[str] = None
     accelerators: Optional[str] = None
+    labels: Optional[Dict[str, str]] = None
     cluster_name_on_cloud: Optional[str] = None
 
 
@@ -203,6 +204,8 @@ class ManagedJobRecord(ResponseBaseModel):
     current_cluster_name: Optional[str] = None
     job_id_on_pool_cluster: Optional[int] = None
     accelerators: Optional[Dict[str, int]] = None
+    labels: Optional[Dict[str, str]] = None
+    links: Optional[Dict[str, str]] = None
 
 
 class VolumeRecord(ResponseBaseModel):
@@ -225,3 +228,4 @@ class VolumeRecord(ResponseBaseModel):
     usedby_pods: List[str]
     usedby_clusters: List[str]
     is_ephemeral: bool = False
+    usedby_fetch_failed: bool = False
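All three additions default to None/False, so payloads from an older server still validate. A minimal stand-in (assuming these are pydantic-style models, which the field syntax suggests) showing the compatibility story for ManagedJobRecord:

from typing import Dict, Optional

from pydantic import BaseModel


# Stand-in for ResponseBaseModel, reduced to the fields shown above.
class ManagedJobRecord(BaseModel):
    current_cluster_name: Optional[str] = None
    job_id_on_pool_cluster: Optional[int] = None
    accelerators: Optional[Dict[str, int]] = None
    labels: Optional[Dict[str, str]] = None
    links: Optional[Dict[str, str]] = None


old_payload = {'current_cluster_name': 'c1'}  # pre-upgrade server
new_payload = {
    'current_cluster_name': 'c1',
    'links': {'console': 'https://example.invalid/instances/i-0abc'},
}
assert ManagedJobRecord(**old_payload).links is None
assert ManagedJobRecord(**new_payload).links['console']
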
sky/schemas/db/global_user_state/010_save_ssh_key.py
CHANGED

@@ -21,7 +21,7 @@ depends_on: Union[str, Sequence[str], None] = None
 
 
 def upgrade():
-    """Add
+    """Add ssh keys if it was not already added to global user state."""
    connection = op.get_bind()
 
     match_dirs = glob.glob(os.path.expanduser('~/.sky/clients/*/ssh'))
sky/schemas/db/spot_jobs/008_add_full_resources.py
ADDED

@@ -0,0 +1,34 @@
+"""Add full_resources column to spot table.
+
+Revision ID: 008
+Revises: 007
+Create Date: 2025-12-03
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '008'
+down_revision: Union[str, Sequence[str], None] = '007'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add full_resources column to spot table."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('spot',
+                                             'full_resources',
+                                             sa.JSON(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No downgrade logic."""
+    pass
sky/schemas/db/spot_jobs/009_job_events.py
ADDED

@@ -0,0 +1,32 @@
+"""Add job_events table for tracking managed job events.
+
+Revision ID: 009
+Revises: 008
+Create Date: 2025-12-11
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+
+from sky.jobs.state import Base
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '009'
+down_revision: Union[str, Sequence[str], None] = '008'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Create job_events table for tracking job events."""
+    with op.get_context().autocommit_block():
+        db_utils.add_table_to_db_sqlalchemy(Base.metadata, op.get_bind(),
+                                            'job_events')
+
+
+def downgrade():
+    """Drop job_events table."""
+    pass
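`add_table_to_db_sqlalchemy` is an internal helper not shown in this diff; in plain SQLAlchemy, creating one table from declarative metadata looks roughly like this (the job_events columns below are illustrative, not the real schema):

import sqlalchemy as sa
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class JobEvent(Base):  # illustrative columns only
    __tablename__ = 'job_events'
    id = sa.Column(sa.Integer, primary_key=True)
    job_id = sa.Column(sa.Integer, index=True)
    event_type = sa.Column(sa.String)
    timestamp = sa.Column(sa.DateTime(timezone=True))


engine = sa.create_engine('sqlite:///:memory:')
# checkfirst=True makes creation idempotent, which is what a guarded
# migration wants.
Base.metadata.tables['job_events'].create(engine, checkfirst=True)
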
sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py
ADDED

@@ -0,0 +1,43 @@
+"""Change job_events timestamp column to support timezone.
+
+Revision ID: 010
+Revises: 009
+Create Date: 2025-12-22
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision: str = '010'
+down_revision: Union[str, Sequence[str], None] = '009'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Change timestamp column to TIMESTAMP WITH TIME ZONE.
+
+    This only affects PostgreSQL - SQLite stores datetimes as text and handles
+    timezone-aware datetimes automatically.
+    """
+    bind = op.get_bind()
+
+    if bind.dialect.name == 'postgresql':
+        # For PostgreSQL, change TIMESTAMP to TIMESTAMPTZ
+        # The USING clause converts existing naive timestamps to UTC
+        with op.get_context().autocommit_block():
+            op.alter_column('job_events',
+                            'timestamp',
+                            type_=sa.DateTime(timezone=True),
+                            existing_type=sa.DateTime(timezone=False),
+                            postgresql_using='timestamp AT TIME ZONE \'UTC\'')
+    # SQLite: no migration needed, timezone support is handled by SQLAlchemy
+
+
+def downgrade():
+    """No downgrade logic."""
+    pass
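On PostgreSQL this emits roughly `ALTER TABLE job_events ALTER COLUMN timestamp TYPE TIMESTAMPTZ USING timestamp AT TIME ZONE 'UTC'`. The USING clause reinterprets each stored naive value as UTC rather than shifting it; in Python terms:

from datetime import datetime, timezone

# What the AT TIME ZONE 'UTC' conversion does to an existing naive row:
# keep the wall-clock reading, attach UTC as its timezone.
naive = datetime(2025, 12, 22, 9, 30, 0)    # value stored pre-migration
aware = naive.replace(tzinfo=timezone.utc)  # meaning post-migration
assert aware.isoformat() == '2025-12-22T09:30:00+00:00'
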
sky/schemas/db/spot_jobs/011_add_links.py
ADDED

@@ -0,0 +1,34 @@
+"""Add links column for storing cluster instance links.
+
+Revision ID: 011
+Revises: 010
+Create Date: 2026-01-07
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '011'
+down_revision: Union[str, Sequence[str], None] = '010'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add links column to store instance links as JSON."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('spot',
+                                             'links',
+                                             sa.JSON(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No downgrade logic."""
+    pass
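The column pairs with the new `sky/utils/instance_links.py` in the file list above. That module's API is not shown in this diff, so the builder below is purely hypothetical; it only illustrates the name-to-URL JSON shape the `links` column stores:

from typing import Dict


# Hypothetical sketch (not the real sky/utils/instance_links.py API).
def build_instance_links(cloud: str, region: str,
                         instance_id: str) -> Dict[str, str]:
    if cloud == 'aws':
        # Standard EC2 console deep-link format.
        return {
            'console': ('https://console.aws.amazon.com/ec2/home'
                        f'?region={region}'
                        f'#InstanceDetails:instanceId={instance_id}')
        }
    return {}


links = build_instance_links('aws', 'us-east-1', 'i-0abc123def456')
# -> stored as JSON in the new spot.links column
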
sky/schemas/generated/jobsv1_pb2.py
CHANGED

@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
 
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\xab\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTask\x12\x14\n\x07user_id\x18\x06 \x01(\tH\x01\x88\x01\x01\x42\x07\n\x05_poolB\n\n\x08_user_id\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"sky/schemas/generated/jobsv1.proto\x12\x07jobs.v1\"\x85\x01\n\rAddJobRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08username\x18\x02 \x01(\t\x12\x15\n\rrun_timestamp\x18\x03 \x01(\t\x12\x15\n\rresources_str\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\tB\x0b\n\t_job_name\"1\n\x0e\x41\x64\x64JobResponse\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07log_dir\x18\x02 \x01(\t\"\xb3\x01\n\x0fQueueJobRequest\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x14\n\x07\x63odegen\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bscript_path\x18\x03 \x01(\t\x12\x16\n\x0eremote_log_dir\x18\x04 \x01(\t\x12\x31\n\x0bmanaged_job\x18\x05 \x01(\x0b\x32\x17.jobs.v1.ManagedJobInfoH\x01\x88\x01\x01\x42\n\n\x08_codegenB\x0e\n\x0c_managed_job\"\xab\x01\n\x0eManagedJobInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x11\n\x04pool\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tworkspace\x18\x03 \x01(\t\x12\x12\n\nentrypoint\x18\x04 \x01(\t\x12&\n\x05tasks\x18\x05 \x03(\x0b\x32\x17.jobs.v1.ManagedJobTask\x12\x14\n\x07user_id\x18\x06 \x01(\tH\x01\x88\x01\x01\x42\x07\n\x05_poolB\n\n\x08_user_id\"]\n\x0eManagedJobTask\x12\x0f\n\x07task_id\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x15\n\rresources_str\x18\x03 \x01(\t\x12\x15\n\rmetadata_json\x18\x04 \x01(\t\"\x12\n\x10QueueJobResponse\"\x15\n\x13UpdateStatusRequest\"\x16\n\x14UpdateStatusResponse\"L\n\x12GetJobQueueRequest\x12\x16\n\tuser_hash\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x10\n\x08\x61ll_jobs\x18\x02 \x01(\x08\x42\x0c\n\n_user_hash\"\xa3\x02\n\x07JobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x10\n\x08job_name\x18\x02 \x01(\t\x12\x10\n\x08username\x18\x03 \x01(\t\x12\x14\n\x0csubmitted_at\x18\x04 \x01(\x01\x12\"\n\x06status\x18\x05 \x01(\x0e\x32\x12.jobs.v1.JobStatus\x12\x15\n\rrun_timestamp\x18\x06 \x01(\t\x12\x15\n\x08start_at\x18\x07 \x01(\x01H\x00\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x08 \x01(\x01H\x01\x88\x01\x01\x12\x11\n\tresources\x18\t \x01(\t\x12\x10\n\x03pid\x18\n \x01(\x03H\x02\x88\x01\x01\x12\x10\n\x08log_path\x18\x0b \x01(\t\x12\x10\n\x08metadata\x18\x0c \x01(\tB\x0b\n\t_start_atB\t\n\x07_end_atB\x06\n\x04_pid\"5\n\x13GetJobQueueResponse\x12\x1e\n\x04jobs\x18\x01 \x03(\x0b\x32\x10.jobs.v1.JobInfo\"^\n\x11\x43\x61ncelJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\x12\x12\n\ncancel_all\x18\x02 \x01(\x08\x12\x16\n\tuser_hash\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\x0c\n\n_user_hash\"/\n\x12\x43\x61ncelJobsResponse\x12\x19\n\x11\x63\x61ncelled_job_ids\x18\x01 \x03(\x03\"\x1e\n\x1c\x46\x61ilAllInProgressJobsRequest\"\x1f\n\x1d\x46\x61ilAllInProgressJobsResponse\"\x7f\n\x0fTailLogsRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x1b\n\x0emanaged_job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x0c\n\x04tail\x18\x04 \x01(\x05\x42\t\n\x07_job_idB\x11\n\x0f_managed_job_id\"7\n\x10TailLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x11\n\texit_code\x18\x02 \x01(\x05\"&\n\x13GetJobStatusRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xa4\x01\n\x14GetJobStatusResponse\x12\x44\n\x0cjob_statuses\x18\x01 \x03(\x0b\x32..jobs.v1.GetJobStatusResponse.JobStatusesEntry\x1a\x46\n\x10JobStatusesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12!\n\x05value\x18\x02 \x01(\x0e\x32\x12.jobs.v1.JobStatus:\x02\x38\x01\"A\n\x1fGetJobSubmittedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"5\n GetJobSubmittedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"=\n\x1bGetJobEndedTimestampRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"1\n\x1cGetJobEndedTimestampResponse\x12\x11\n\ttimestamp\x18\x01 \x01(\x02\"+\n\x18GetLogDirsForJobsRequest\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\x98\x01\n\x19GetLogDirsForJobsResponse\x12H\n\x0cjob_log_dirs\x18\x01 \x03(\x0b\x32\x32.jobs.v1.GetLogDirsForJobsResponse.JobLogDirsEntry\x1a\x31\n\x0fJobLogDirsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"8\n\x16GetJobExitCodesRequest\x12\x13\n\x06job_id\x18\x01 \x01(\x03H\x00\x88\x01\x01\x42\t\n\x07_job_id\"-\n\x17GetJobExitCodesResponse\x12\x12\n\nexit_codes\x18\x01 \x03(\x05*\x8d\x02\n\tJobStatus\x12\x1a\n\x16JOB_STATUS_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOB_STATUS_INIT\x10\x01\x12\x16\n\x12JOB_STATUS_PENDING\x10\x02\x12\x19\n\x15JOB_STATUS_SETTING_UP\x10\x03\x12\x16\n\x12JOB_STATUS_RUNNING\x10\x04\x12\x1c\n\x18JOB_STATUS_FAILED_DRIVER\x10\x05\x12\x18\n\x14JOB_STATUS_SUCCEEDED\x10\x06\x12\x15\n\x11JOB_STATUS_FAILED\x10\x07\x12\x1b\n\x17JOB_STATUS_FAILED_SETUP\x10\x08\x12\x18\n\x14JOB_STATUS_CANCELLED\x10\t2\xe7\x07\n\x0bJobsService\x12\x39\n\x06\x41\x64\x64Job\x12\x16.jobs.v1.AddJobRequest\x1a\x17.jobs.v1.AddJobResponse\x12?\n\x08QueueJob\x12\x18.jobs.v1.QueueJobRequest\x1a\x19.jobs.v1.QueueJobResponse\x12K\n\x0cUpdateStatus\x12\x1c.jobs.v1.UpdateStatusRequest\x1a\x1d.jobs.v1.UpdateStatusResponse\x12H\n\x0bGetJobQueue\x12\x1b.jobs.v1.GetJobQueueRequest\x1a\x1c.jobs.v1.GetJobQueueResponse\x12\x45\n\nCancelJobs\x12\x1a.jobs.v1.CancelJobsRequest\x1a\x1b.jobs.v1.CancelJobsResponse\x12\x66\n\x15\x46\x61ilAllInProgressJobs\x12%.jobs.v1.FailAllInProgressJobsRequest\x1a&.jobs.v1.FailAllInProgressJobsResponse\x12\x41\n\x08TailLogs\x12\x18.jobs.v1.TailLogsRequest\x1a\x19.jobs.v1.TailLogsResponse0\x01\x12K\n\x0cGetJobStatus\x12\x1c.jobs.v1.GetJobStatusRequest\x1a\x1d.jobs.v1.GetJobStatusResponse\x12o\n\x18GetJobSubmittedTimestamp\x12(.jobs.v1.GetJobSubmittedTimestampRequest\x1a).jobs.v1.GetJobSubmittedTimestampResponse\x12\x63\n\x14GetJobEndedTimestamp\x12$.jobs.v1.GetJobEndedTimestampRequest\x1a%.jobs.v1.GetJobEndedTimestampResponse\x12Z\n\x11GetLogDirsForJobs\x12!.jobs.v1.GetLogDirsForJobsRequest\x1a\".jobs.v1.GetLogDirsForJobsResponse\x12T\n\x0fGetJobExitCodes\x12\x1f.jobs.v1.GetJobExitCodesRequest\x1a .jobs.v1.GetJobExitCodesResponseb\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,8 +25,8 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETJOBSTATUSRESPONSE_JOBSTATUSESENTRY']._serialized_options = b'8\001'
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._loaded_options = None
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_options = b'8\001'
-  _globals['_JOBSTATUS']._serialized_start=
-  _globals['_JOBSTATUS']._serialized_end=
+  _globals['_JOBSTATUS']._serialized_start=2324
+  _globals['_JOBSTATUS']._serialized_end=2593
   _globals['_ADDJOBREQUEST']._serialized_start=48
   _globals['_ADDJOBREQUEST']._serialized_end=181
   _globals['_ADDJOBRESPONSE']._serialized_start=183
@@ -81,6 +81,10 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GETLOGDIRSFORJOBSRESPONSE']._serialized_end=2216
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_start=2167
   _globals['_GETLOGDIRSFORJOBSRESPONSE_JOBLOGDIRSENTRY']._serialized_end=2216
-  _globals['
-  _globals['
+  _globals['_GETJOBEXITCODESREQUEST']._serialized_start=2218
+  _globals['_GETJOBEXITCODESREQUEST']._serialized_end=2274
+  _globals['_GETJOBEXITCODESRESPONSE']._serialized_start=2276
+  _globals['_GETJOBEXITCODESRESPONSE']._serialized_end=2321
+  _globals['_JOBSSERVICE']._serialized_start=2596
+  _globals['_JOBSSERVICE']._serialized_end=3595
 # @@protoc_insertion_point(module_scope)
sky/schemas/generated/jobsv1_pb2.pyi
CHANGED

@@ -252,3 +252,15 @@ class GetLogDirsForJobsResponse(_message.Message):
     JOB_LOG_DIRS_FIELD_NUMBER: _ClassVar[int]
     job_log_dirs: _containers.ScalarMap[int, str]
     def __init__(self, job_log_dirs: _Optional[_Mapping[int, str]] = ...) -> None: ...
+
+class GetJobExitCodesRequest(_message.Message):
+    __slots__ = ("job_id",)
+    JOB_ID_FIELD_NUMBER: _ClassVar[int]
+    job_id: int
+    def __init__(self, job_id: _Optional[int] = ...) -> None: ...
+
+class GetJobExitCodesResponse(_message.Message):
+    __slots__ = ("exit_codes",)
+    EXIT_CODES_FIELD_NUMBER: _ClassVar[int]
+    exit_codes: _containers.RepeatedScalarFieldContainer[int]
+    def __init__(self, exit_codes: _Optional[_Iterable[int]] = ...) -> None: ...
sky/schemas/generated/jobsv1_pb2_grpc.py
CHANGED

@@ -94,6 +94,11 @@ class JobsServiceStub(object):
                 request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsRequest.SerializeToString,
                 response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsResponse.FromString,
                 _registered_method=True)
+        self.GetJobExitCodes = channel.unary_unary(
+                '/jobs.v1.JobsService/GetJobExitCodes',
+                request_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesRequest.SerializeToString,
+                response_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesResponse.FromString,
+                _registered_method=True)
 
 
 class JobsServiceServicer(object):
@@ -176,6 +181,13 @@ class JobsServiceServicer(object):
         context.set_details('Method not implemented!')
         raise NotImplementedError('Method not implemented!')
 
+    def GetJobExitCodes(self, request, context):
+        """Get job exit codes.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
 
 def add_JobsServiceServicer_to_server(servicer, server):
     rpc_method_handlers = {
@@ -234,6 +246,11 @@ def add_JobsServiceServicer_to_server(servicer, server):
                     request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsRequest.FromString,
                     response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetLogDirsForJobsResponse.SerializeToString,
             ),
+            'GetJobExitCodes': grpc.unary_unary_rpc_method_handler(
+                    servicer.GetJobExitCodes,
+                    request_deserializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesRequest.FromString,
+                    response_serializer=sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesResponse.SerializeToString,
+            ),
     }
     generic_handler = grpc.method_handlers_generic_handler(
             'jobs.v1.JobsService', rpc_method_handlers)
@@ -540,3 +557,30 @@ class JobsService(object):
             timeout,
             metadata,
             _registered_method=True)
+
+    @staticmethod
+    def GetJobExitCodes(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/jobs.v1.JobsService/GetJobExitCodes',
+            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_jobsv1__pb2.GetJobExitCodesResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
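With the regenerated stubs, a client can query a job's exit codes over the jobs service. A minimal sketch; only the message and stub names come from this diff, while the target address is hypothetical and depends on where the gRPC server listens:

import grpc

from sky.schemas.generated import jobsv1_pb2, jobsv1_pb2_grpc

with grpc.insecure_channel('localhost:50051') as channel:
    stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
    response = stub.GetJobExitCodes(
        jobsv1_pb2.GetJobExitCodesRequest(job_id=1))
    print(list(response.exit_codes))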