skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/logs/agent.py
CHANGED
|
@@ -38,7 +38,7 @@ class FluentbitAgent(LoggingAgent):
|
|
|
38
38
|
'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
|
|
39
39
|
'sudo apt-get update; sudo apt-get install -y gnupg; '
|
|
40
40
|
# pylint: disable=line-too-long
|
|
41
|
-
'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
|
|
41
|
+
'sudo sh -c \'curl -L https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
|
|
42
42
|
# pylint: disable=line-too-long
|
|
43
43
|
'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
|
|
44
44
|
# pylint: disable=line-too-long
|
sky/metrics/utils.py
CHANGED
|
@@ -48,8 +48,15 @@ SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
|
|
|
48
48
|
'sky_apiserver_code_duration_seconds',
|
|
49
49
|
'Time spent processing code',
|
|
50
50
|
['name', 'group'],
|
|
51
|
-
buckets=(0.
|
|
52
|
-
|
|
51
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
|
|
52
|
+
0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
|
|
53
|
+
5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
54
|
+
50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
|
|
55
|
+
240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
|
|
56
|
+
420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
|
|
57
|
+
600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
|
|
58
|
+
780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
|
|
59
|
+
960.0, 980.0, 1000.0, float('inf')),
|
|
53
60
|
)
|
|
54
61
|
|
|
55
62
|
# Total number of API server requests, grouped by path, method, and status.
|
|
@@ -65,16 +72,30 @@ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
|
|
|
65
72
|
'sky_apiserver_request_duration_seconds',
|
|
66
73
|
'Time spent processing API server requests',
|
|
67
74
|
['path', 'method', 'status'],
|
|
68
|
-
buckets=(0.
|
|
69
|
-
|
|
75
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
|
|
76
|
+
0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
|
|
77
|
+
5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
78
|
+
50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
|
|
79
|
+
240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
|
|
80
|
+
420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
|
|
81
|
+
600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
|
|
82
|
+
780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
|
|
83
|
+
960.0, 980.0, 1000.0, float('inf')),
|
|
70
84
|
)
|
|
71
85
|
|
|
72
86
|
SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
|
|
73
87
|
'sky_apiserver_event_loop_lag_seconds',
|
|
74
88
|
'Scheduling delay of the server event loop',
|
|
75
89
|
['pid'],
|
|
76
|
-
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.
|
|
77
|
-
|
|
90
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
|
|
91
|
+
0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
|
|
92
|
+
5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
93
|
+
50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
|
|
94
|
+
240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
|
|
95
|
+
420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
|
|
96
|
+
600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
|
|
97
|
+
780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
|
|
98
|
+
960.0, 980.0, 1000.0, float('inf')),
|
|
78
99
|
)
|
|
79
100
|
|
|
80
101
|
SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
|
|
@@ -122,6 +143,24 @@ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
|
|
|
122
143
|
'RSS increment after requests', ['name'],
|
|
123
144
|
buckets=_MEM_BUCKETS)
|
|
124
145
|
|
|
146
|
+
SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
|
|
147
|
+
'sky_apiserver_websocket_ssh_latency_seconds',
|
|
148
|
+
('Time taken for ssh message to go from client to API server and back'
|
|
149
|
+
'to the client. This does not include: latency to reach the pod, '
|
|
150
|
+
'overhead from sending through the k8s port-forward tunnel, or '
|
|
151
|
+
'ssh server lag on the destination pod.'),
|
|
152
|
+
['pid'],
|
|
153
|
+
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
|
|
154
|
+
0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
|
|
155
|
+
5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
|
|
156
|
+
50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
|
|
157
|
+
240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
|
|
158
|
+
420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
|
|
159
|
+
600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
|
|
160
|
+
780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
|
|
161
|
+
960.0, 980.0, 1000.0, float('inf')),
|
|
162
|
+
)
|
|
163
|
+
|
|
125
164
|
|
|
126
165
|
@contextlib.contextmanager
|
|
127
166
|
def time_it(name: str, group: str = 'default'):
|
sky/optimizer.py
CHANGED
|
@@ -1019,7 +1019,7 @@ class Optimizer:
|
|
|
1019
1019
|
if res.instance_type is not None
|
|
1020
1020
|
])
|
|
1021
1021
|
candidate_str = resources_utils.format_resource(
|
|
1022
|
-
best_resources,
|
|
1022
|
+
best_resources, simplified_only=True)[0]
|
|
1023
1023
|
|
|
1024
1024
|
logger.info(
|
|
1025
1025
|
f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
|
sky/provision/__init__.py
CHANGED
|
@@ -28,6 +28,7 @@ from sky.provision import primeintellect
|
|
|
28
28
|
from sky.provision import runpod
|
|
29
29
|
from sky.provision import scp
|
|
30
30
|
from sky.provision import seeweb
|
|
31
|
+
from sky.provision import shadeform
|
|
31
32
|
from sky.provision import ssh
|
|
32
33
|
from sky.provision import vast
|
|
33
34
|
from sky.provision import vsphere
|
|
@@ -79,6 +80,7 @@ def query_instances(
|
|
|
79
80
|
cluster_name_on_cloud: str,
|
|
80
81
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
81
82
|
non_terminated_only: bool = True,
|
|
83
|
+
retry_if_missing: bool = False,
|
|
82
84
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
83
85
|
"""Query instances.
|
|
84
86
|
|
|
@@ -87,6 +89,11 @@ def query_instances(
|
|
|
87
89
|
|
|
88
90
|
A None status means the instance is marked as "terminated"
|
|
89
91
|
or "terminating".
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
retry_if_missing: Whether to retry the call to the cloud api if the
|
|
95
|
+
cluster is not found when querying the live status on the cloud.
|
|
96
|
+
NOTE: This is currently only used on kubernetes.
|
|
90
97
|
"""
|
|
91
98
|
raise NotImplementedError
|
|
92
99
|
|
sky/provision/aws/instance.py
CHANGED
|
@@ -630,9 +630,10 @@ def query_instances(
|
|
|
630
630
|
cluster_name_on_cloud: str,
|
|
631
631
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
632
632
|
non_terminated_only: bool = True,
|
|
633
|
+
retry_if_missing: bool = False,
|
|
633
634
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
634
635
|
"""See sky/provision/__init__.py"""
|
|
635
|
-
del cluster_name # unused
|
|
636
|
+
del cluster_name, retry_if_missing # unused
|
|
636
637
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
637
638
|
region = provider_config['region']
|
|
638
639
|
ec2 = _default_ec2_resource(region)
|
sky/provision/azure/instance.py
CHANGED
|
@@ -957,9 +957,10 @@ def query_instances(
|
|
|
957
957
|
cluster_name_on_cloud: str,
|
|
958
958
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
959
959
|
non_terminated_only: bool = True,
|
|
960
|
+
retry_if_missing: bool = False,
|
|
960
961
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
961
962
|
"""See sky/provision/__init__.py"""
|
|
962
|
-
del cluster_name # unused
|
|
963
|
+
del cluster_name, retry_if_missing # unused
|
|
963
964
|
assert provider_config is not None, cluster_name_on_cloud
|
|
964
965
|
|
|
965
966
|
subscription_id = provider_config['subscription_id']
|
sky/provision/common.py
CHANGED
|
@@ -97,6 +97,8 @@ class InstanceInfo:
|
|
|
97
97
|
external_ip: Optional[str]
|
|
98
98
|
tags: Dict[str, str]
|
|
99
99
|
ssh_port: int = 22
|
|
100
|
+
# The internal service address of the instance on Kubernetes.
|
|
101
|
+
internal_svc: Optional[str] = None
|
|
100
102
|
|
|
101
103
|
def get_feasible_ip(self) -> str:
|
|
102
104
|
"""Get the most feasible IPs of the instance. This function returns
|
sky/provision/cudo/instance.py
CHANGED
|
@@ -195,9 +195,10 @@ def query_instances(
|
|
|
195
195
|
cluster_name_on_cloud: str,
|
|
196
196
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
197
197
|
non_terminated_only: bool = True,
|
|
198
|
+
retry_if_missing: bool = False,
|
|
198
199
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
199
200
|
"""See sky/provision/__init__.py"""
|
|
200
|
-
del cluster_name # unused
|
|
201
|
+
del cluster_name, retry_if_missing # unused
|
|
201
202
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
202
203
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
203
204
|
|
sky/provision/do/instance.py
CHANGED
|
@@ -246,9 +246,10 @@ def query_instances(
|
|
|
246
246
|
cluster_name_on_cloud: str,
|
|
247
247
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
248
248
|
non_terminated_only: bool = True,
|
|
249
|
+
retry_if_missing: bool = False,
|
|
249
250
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
250
251
|
"""See sky/provision/__init__.py"""
|
|
251
|
-
del cluster_name # unused
|
|
252
|
+
del cluster_name, retry_if_missing # unused
|
|
252
253
|
# terminated instances are not retrieved by the
|
|
253
254
|
# API making `non_terminated_only` argument moot.
|
|
254
255
|
del non_terminated_only
|
sky/provision/fluidstack/instance.py
CHANGED
|
@@ -3,11 +3,11 @@ import os
|
|
|
3
3
|
import time
|
|
4
4
|
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
|
-
from sky import authentication as auth
|
|
7
6
|
from sky import exceptions
|
|
8
7
|
from sky import sky_logging
|
|
9
8
|
from sky.provision import common
|
|
10
9
|
from sky.provision.fluidstack import fluidstack_utils as utils
|
|
10
|
+
from sky.utils import auth_utils
|
|
11
11
|
from sky.utils import command_runner
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
from sky.utils import status_lib
|
|
@@ -27,7 +27,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
27
27
|
def get_internal_ip(node_info: Dict[str, Any]) -> None:
|
|
28
28
|
node_info['internal_ip'] = node_info['ip_address']
|
|
29
29
|
|
|
30
|
-
private_key_path, _ =
|
|
30
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
31
31
|
runner = command_runner.SSHCommandRunner(
|
|
32
32
|
(node_info['ip_address'], 22),
|
|
33
33
|
ssh_user='ubuntu',
|
|
@@ -291,9 +291,10 @@ def query_instances(
|
|
|
291
291
|
cluster_name_on_cloud: str,
|
|
292
292
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
293
293
|
non_terminated_only: bool = True,
|
|
294
|
+
retry_if_missing: bool = False,
|
|
294
295
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
295
296
|
"""See sky/provision/__init__.py"""
|
|
296
|
-
del cluster_name # unused
|
|
297
|
+
del cluster_name, retry_if_missing # unused
|
|
297
298
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
298
299
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
299
300
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
sky/provision/gcp/instance.py
CHANGED
|
@@ -62,9 +62,10 @@ def query_instances(
|
|
|
62
62
|
cluster_name_on_cloud: str,
|
|
63
63
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
64
64
|
non_terminated_only: bool = True,
|
|
65
|
+
retry_if_missing: bool = False,
|
|
65
66
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
66
67
|
"""See sky/provision/__init__.py"""
|
|
67
|
-
del cluster_name # unused
|
|
68
|
+
del cluster_name, retry_if_missing # unused
|
|
68
69
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
69
70
|
zone = provider_config['availability_zone']
|
|
70
71
|
project_id = provider_config['project_id']
|
sky/provision/hyperbolic/instance.py
CHANGED
|
@@ -309,9 +309,10 @@ def query_instances(
|
|
|
309
309
|
cluster_name_on_cloud: str,
|
|
310
310
|
provider_config: Optional[dict] = None,
|
|
311
311
|
non_terminated_only: bool = True,
|
|
312
|
+
retry_if_missing: bool = False,
|
|
312
313
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
313
314
|
"""Returns the status of the specified instances for Hyperbolic."""
|
|
314
|
-
del cluster_name, provider_config # unused
|
|
315
|
+
del cluster_name, provider_config, retry_if_missing # unused
|
|
315
316
|
# Fetch all instances for this cluster
|
|
316
317
|
instances = utils.list_instances(
|
|
317
318
|
metadata={'skypilot': {
|
sky/provision/instance_setup.py
CHANGED
|
@@ -434,8 +434,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
|
|
|
434
434
|
# use the external IP of the head node.
|
|
435
435
|
use_external_ip = cluster_info.custom_ray_options.pop(
|
|
436
436
|
'use_external_ip', False)
|
|
437
|
-
|
|
438
|
-
|
|
437
|
+
|
|
438
|
+
if use_external_ip:
|
|
439
|
+
head_ip = head_instance.external_ip
|
|
440
|
+
else:
|
|
441
|
+
# For Kubernetes, use the internal service address of the head node.
|
|
442
|
+
# Keep this consistent with the logic in kubernetes-ray.yml.j2
|
|
443
|
+
if head_instance.internal_svc:
|
|
444
|
+
head_ip = head_instance.internal_svc
|
|
445
|
+
else:
|
|
446
|
+
head_ip = head_instance.internal_ip
|
|
439
447
|
|
|
440
448
|
ray_cmd = ray_worker_start_command(custom_resource,
|
|
441
449
|
cluster_info.custom_ray_options,
|
sky/provision/kubernetes/constants.py
CHANGED
|
@@ -18,7 +18,6 @@ SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'
|
|
|
18
18
|
|
|
19
19
|
# Labels for the Pods created by SkyPilot
|
|
20
20
|
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
|
|
21
|
-
TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
|
|
22
21
|
TAG_POD_INITIALIZED = 'skypilot-initialized'
|
|
23
22
|
TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
|
|
24
23
|
|