skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
--- a/sky/backends/cloud_vm_ray_backend.py
+++ b/sky/backends/cloud_vm_ray_backend.py
@@ -141,6 +141,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
     clouds.OCI: 300,
     clouds.Paperspace: 600,
     clouds.Kubernetes: 300,
+    clouds.Shadeform: 300,
     clouds.Vsphere: 240,
 }
 
@@ -211,6 +212,7 @@ _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
     ('too long', 255),
     ('request-uri too large', 1),
     ('request header fields too large', 1),
+    ('400 bad request', 1),  # CloudFlare 400 error
 ]
 
 _RESOURCES_UNAVAILABLE_LOG = (
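For context, a small hedged sketch of how a (message substring, return code) table like _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT is typically consulted before falling back to dumping an inline script to a file; the helper name and matching logic below are illustrative, not SkyPilot's actual implementation.

from typing import List, Tuple

MSG_AND_CODE: List[Tuple[str, int]] = [
    ('too long', 255),
    ('request-uri too large', 1),
    ('request header fields too large', 1),
    ('400 bad request', 1),  # e.g. a CloudFlare 400 returned by a proxy in front of the API
]

def should_dump_inline_script(stderr: str, returncode: int) -> bool:
    """Return True if the failure looks like 'inline command too long'."""
    lowered = stderr.lower()
    return any(msg in lowered and returncode == code
               for msg, code in MSG_AND_CODE)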
@@ -303,6 +305,7 @@ def _get_cluster_config_template(cloud):
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.SSH: 'kubernetes-ray.yml.j2',
+        clouds.Shadeform: 'shadeform-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
         clouds.Vast: 'vast-ray.yml.j2',
         clouds.Fluidstack: 'fluidstack-ray.yml.j2',
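Registering Shadeform follows the same two-step pattern as the other clouds: a provisioning timeout entry above and a cluster template mapping here. A minimal, self-contained sketch of such a lookup, using string keys as stand-ins (SkyPilot keys the real dict on cloud classes, not strings):

CLOUD_TO_TEMPLATE = {
    'runpod': 'runpod-ray.yml.j2',
    'kubernetes': 'kubernetes-ray.yml.j2',
    'shadeform': 'shadeform-ray.yml.j2',
}

def get_cluster_config_template(cloud_name: str) -> str:
    """Map a cloud name to its provisioning template, or fail loudly."""
    try:
        return CLOUD_TO_TEMPLATE[cloud_name.lower()]
    except KeyError:
        raise ValueError(
            f'No provisioning template registered for {cloud_name!r}') from None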
@@ -602,7 +605,11 @@ class RayCodeGen:
     # skip the scheduling step.
     job_lib.scheduler.schedule_step()
 
-    total_num_nodes = len(ray.nodes())
+    # If some nodes are down and then new nodes are added after launching again,
+    # the result of `ray.nodes()` will include all the nodes, so we need to get
+    # the alive nodes.
+    alive_nodes = [n for n in ray.nodes() if 'Alive' in n and n['Alive']]
+    total_num_nodes = len(alive_nodes)
     setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
     setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
     setup_workers = [run_bash_command_with_log_and_return_pid \\
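A small standalone sketch of the alive-node filtering introduced above, assuming Ray is installed and a cluster is already reachable; the CPU fraction is a stand-in for SkyPilot's _SETUP_CPUS. ray.nodes() also reports nodes that have died since launch, so only entries whose 'Alive' field is truthy should count toward the placement group size.

import ray
from ray.util import placement_group

ray.init(address='auto')  # connect to the existing cluster

# Dead nodes from earlier launches still show up in ray.nodes().
alive_nodes = [n for n in ray.nodes() if n.get('Alive')]
total_num_nodes = len(alive_nodes)

# One tiny bundle per live node, spread strictly across nodes
# (0.001 CPU is an illustrative value, not SkyPilot's constant).
setup_bundles = [{'CPU': 0.001} for _ in range(total_num_nodes)]
setup_pg = placement_group(setup_bundles, strategy='STRICT_SPREAD')
ray.get(setup_pg.ready())  # blocks until every live node holds a bundle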
@@ -2362,9 +2369,8 @@ class RetryingVmProvisioner(object):
         for (resource, exception) in resource_exceptions.items():
             table.add_row([
                 resource.infra.formatted_str(),
-                resources_utils.format_resource(
-
-                exception
+                resources_utils.format_resource(
+                    resource, simplified_only=True)[0], exception
             ])
         # Set the max width of REASON column to 80 to avoid the table
         # being wrapped in a unreadable way.
@@ -2464,6 +2470,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     def get_cluster_name(self):
         return self.cluster_name
 
+    def get_cluster_name_on_cloud(self):
+        return self.cluster_name_on_cloud
+
     def _use_internal_ips(self):
         """Returns whether to use internal IPs for SSH connections."""
         # Directly load the `use_internal_ips` flag from the cluster yaml
@@ -2800,6 +2809,18 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             self.cluster_name,
             (tunnel.port, tunnel.pid) if tunnel is not None else None)
 
+    def close_skylet_ssh_tunnel(self) -> None:
+        """Terminate the SSH tunnel process and clear its metadata."""
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is None:
+            return
+        logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
+                     self.cluster_name, tunnel.port)
+        try:
+            self._terminate_ssh_tunnel_process(tunnel)
+        finally:
+            self._set_skylet_ssh_tunnel(None)
+
     def get_grpc_channel(self) -> 'grpc.Channel':
         grpc_options = [
             # The task YAMLs can be large, so the default
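The new close_skylet_ssh_tunnel relies on try/finally so the recorded tunnel metadata is cleared even if terminating the process raises. A condensed, illustrative sketch of that invariant with stand-in names:

from typing import Callable, Optional

class TunnelHolder:
    """Tracks at most one tunnel pid; clearing must survive kill failures."""

    def __init__(self) -> None:
        self._pid: Optional[int] = None

    def open(self, pid: int) -> None:
        self._pid = pid

    def close(self, kill: Callable[[int], None]) -> None:
        if self._pid is None:
            return
        try:
            kill(self._pid)   # may raise if the process misbehaves
        finally:
            self._pid = None  # the recorded metadata is cleared either way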
@@ -2825,7 +2846,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 return grpc.insecure_channel(f'localhost:{tunnel.port}',
                                              options=grpc_options)
             except socket.error as e:
-                logger.
+                logger.debug(
                     'Failed to connect to SSH tunnel for cluster '
                     f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
                     'acquiring lock')
@@ -2851,7 +2872,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 return grpc.insecure_channel(f'localhost:{tunnel.port}',
                                              options=grpc_options)
             except socket.error as e:
-                logger.
+                logger.debug(
                     'Failed to connect to SSH tunnel for cluster '
                     f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
                     'opening new tunnel')
@@ -2866,19 +2887,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'the lock at {lock_id}. '
                 f'{common_utils.format_exception(e)}') from e
 
-    def
-    """
+    def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
+        """Terminate the SSH tunnel process."""
         try:
             proc = psutil.Process(tunnel_info.pid)
             if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
                 logger.debug(
                     f'Terminating SSH tunnel process {tunnel_info.pid}')
-                proc.
-                try:
-                    proc.wait(timeout=3)
-                except psutil.TimeoutExpired:
-                    proc.kill()
-                    proc.wait(timeout=1)
+                subprocess_utils.kill_children_processes(proc.pid)
         except psutil.NoSuchProcess:
             pass
         except Exception as e:  # pylint: disable=broad-except
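The hunk above swaps the inline terminate/wait/kill sequence for SkyPilot's subprocess_utils.kill_children_processes, whose implementation is not part of this diff. A rough psutil-based sketch of what such a process-tree cleanup generally does (the helper name and escalation timeout are assumptions):

import psutil

def kill_process_tree(pid: int, timeout: float = 3.0) -> None:
    """Terminate a process and its descendants, escalating to kill()."""
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    procs = root.children(recursive=True) + [root]
    for p in procs:
        try:
            p.terminate()  # polite SIGTERM first
        except psutil.NoSuchProcess:
            pass
    _, alive = psutil.wait_procs(procs, timeout=timeout)
    for p in alive:
        try:
            p.kill()  # escalate for anything that ignored SIGTERM
        except psutil.NoSuchProcess:
            pass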
@@ -2924,17 +2940,17 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 # Clean up existing tunnel before setting up the new one.
                 old_tunnel = self._get_skylet_ssh_tunnel()
                 if old_tunnel is not None:
-                    self.
+                    self._terminate_ssh_tunnel_process(old_tunnel)
                 self._set_skylet_ssh_tunnel(tunnel_info)
                 return tunnel_info
             except grpc.FutureTimeoutError as e:
-                self.
+                self._terminate_ssh_tunnel_process(tunnel_info)
                 logger.warning(
                     f'Skylet gRPC channel for cluster {self.cluster_name} not '
                     f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
                 raise e
             except Exception as e:
-                self.
+                self._terminate_ssh_tunnel_process(tunnel_info)
                 raise e
 
     @property
@@ -2947,6 +2963,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     def cluster_yaml(self, value: Optional[str]):
         self._cluster_yaml = value
 
+    @property
+    def instance_ids(self):
+        if self.cached_cluster_info is not None:
+            return self.cached_cluster_info.instance_ids()
+        return None
+
     @property
     def ssh_user(self):
         if self.cached_cluster_info is not None:
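The new instance_ids property simply delegates to the cached provisioning result. A rough, illustrative shape of that data; SkyPilot's real ClusterInfo in sky/provision/common.py differs in detail and the types below are stand-ins:

from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class InstanceRecordSketch:
    instance_id: str
    internal_ip: str
    external_ip: Optional[str] = None

@dataclass
class ClusterInfoSketch:
    head_instance_id: Optional[str] = None
    # node id -> records for the instances backing that node
    instances: Dict[str, List[InstanceRecordSketch]] = field(default_factory=dict)

    def instance_ids(self) -> List[str]:
        return [rec.instance_id
                for records in self.instances.values()
                for rec in records]

With a shape like this, the handle property returns cached_cluster_info.instance_ids() when provisioning output has been cached and None otherwise, matching the guard in the hunk above.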
@@ -3616,9 +3638,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
                 retry_message = ux_utils.retry_message(
                     f'Retry after {gap_seconds:.0f}s ')
-                hint_message = (
-
-
+                hint_message = (
+                    f'\n{retry_message} '
+                    f'{ux_utils.provision_hint(cluster_name)}'
+                    f'{colorama.Style.RESET_ALL}')
 
                 # Add cluster event for retry.
                 global_user_state.add_cluster_event(
@@ -3647,7 +3670,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 logger.error(
                     ux_utils.error_message(
                         'Failed to provision resources. '
-                        f'{ux_utils.
+                        f'{ux_utils.provision_hint(cluster_name)}'))
                 error_message += (
                     '\nTo keep retrying until the cluster is up, use '
                     'the `--retry-until-up` flag.')
@@ -3706,6 +3729,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # manually or by the cloud provider.
             # Optimize the case where the cluster's IPs can be retrieved
             # from cluster_info.
+            handle.cached_cluster_info = cluster_info
             handle.docker_user = cluster_info.docker_user
             handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
                                       cluster_info=cluster_info)
@@ -3717,7 +3741,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status,
+                prev_cluster_status, config_hash)
             return handle, False
 
         cluster_config_file = config_dict['ray']
@@ -3789,7 +3813,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         self._update_after_cluster_provisioned(
             handle, to_provision_config.prev_handle, task,
-            prev_cluster_status,
+            prev_cluster_status, config_hash)
         return handle, False
 
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3807,7 +3831,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-
+            config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3919,8 +3943,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)
 
-        locks.get_lock(lock_id).force_unlock()
-
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
                       workdir: Union[Path, Dict[str, Any]],
                       envs_and_secrets: Dict[str, str]) -> None:
@@ -4215,6 +4237,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             codegen: str,
             job_id: int,
             managed_job_dag: Optional['dag.Dag'] = None,
+            managed_job_user_id: Optional[str] = None,
             remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
@@ -4287,7 +4310,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 pool=managed_job_dag.pool,
                 workspace=workspace,
                 entrypoint=entrypoint,
-                tasks=managed_job_tasks
+                tasks=managed_job_tasks,
+                user_id=managed_job_user_id)
 
             if _is_command_length_over_limit(codegen):
                 _dump_code_to_file(codegen)
@@ -4324,7 +4348,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     managed_job_dag,
                     skypilot_config.get_active_workspace(
                         force_user_workspace=True),
-                    entrypoint=common_utils.get_current_command()
+                    entrypoint=common_utils.get_current_command(),
+                    user_hash=managed_job_user_id)
                 # Set the managed job to PENDING state to make sure that
                 # this managed job appears in the `sky jobs queue`, even
                 # if it needs to wait to be submitted.
@@ -5114,6 +5139,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Raises:
             RuntimeError: If the cluster fails to be terminated/stopped.
         """
+        try:
+            handle.close_skylet_ssh_tunnel()
+        except Exception as e:  # pylint: disable=broad-except
+            # Not critical to the cluster teardown, just log a warning.
+            logger.warning(
+                'Failed to close Skylet SSH tunnel for cluster '
+                f'{handle.cluster_name}: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
         exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
         # We have to kill the cluster requests again within the lock, because
         # any pending requests on the same cluster should be cancelled after
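Closing the tunnel during teardown is deliberately best-effort: a failure is logged and ignored rather than aborting the operation. A generic sketch of that pattern (the helper below is illustrative, not a SkyPilot API):

import logging
from typing import Callable

logger = logging.getLogger(__name__)

def run_best_effort(step_name: str, step: Callable[[], None]) -> None:
    """Run a non-critical cleanup step; warn and continue on any failure."""
    try:
        step()
    except Exception as exc:  # pylint: disable=broad-except
        logger.warning('%s failed; continuing teardown: %r', step_name, exc)

# Hypothetical usage:
# run_best_effort('close skylet SSH tunnel', handle.close_skylet_ssh_tunnel)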
@@ -5150,7 +5184,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # observed in AWS. See also
                     # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
                     force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                    cluster_lock_already_held=True
+                    cluster_lock_already_held=True,
+                    retry_if_missing=False))
                 cluster_status_fetched = True
             except exceptions.ClusterStatusFetchingError:
                 logger.warning(
@@ -6269,6 +6304,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         env_vars.update(self._skypilot_predefined_env_vars(handle))
         return env_vars
 
+    def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+        """Returns the user id for the managed job."""
+        if task.managed_job_dag is not None:
+            return task.envs[constants.USER_ID_ENV_VAR]
+        return None
+
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
                                remote_log_dir: str) -> None:
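A hedged sketch of how the submitting user's id can be read from task environment variables for managed jobs, mirroring what the new _get_managed_job_user_id helper does; the environment variable name is an assumption about what constants.USER_ID_ENV_VAR holds, and the surrounding types are stand-ins.

from typing import Dict, Optional

# Assumed value of constants.USER_ID_ENV_VAR.
USER_ID_ENV_VAR = 'SKYPILOT_USER_ID'

def get_managed_job_user_id(task_envs: Dict[str, str],
                            is_managed_job: bool) -> Optional[str]:
    """Return the submitting user's id only when the task is a managed job."""
    if not is_managed_job:
        return None
    return task_envs.get(USER_ID_ENV_VAR)

The id is then threaded through _exec_code_on_head(..., managed_job_user_id=...) so the managed-job record keeps track of which user submitted the job.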
@@ -6307,11 +6348,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         codegen.add_epilogue()
 
-        self._exec_code_on_head(
-
-
-
-
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)
 
     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
@@ -6362,8 +6405,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         codegen.add_epilogue()
         # TODO(zhanghao): Add help info for downloading logs.
-        self._exec_code_on_head(
-
-
-
-
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)