skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -16,15 +16,20 @@ import typing
|
|
16
16
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
17
17
|
import uuid
|
18
18
|
|
19
|
-
from sky import
|
20
|
-
from sky import
|
19
|
+
from sky import models
|
20
|
+
from sky import sky_logging
|
21
21
|
from sky.utils import common_utils
|
22
22
|
from sky.utils import db_utils
|
23
|
+
from sky.utils import registry
|
24
|
+
from sky.utils import status_lib
|
23
25
|
|
24
26
|
if typing.TYPE_CHECKING:
|
25
27
|
from sky import backends
|
28
|
+
from sky import clouds
|
26
29
|
from sky.data import Storage
|
27
30
|
|
31
|
+
logger = sky_logging.init_logger(__name__)
|
32
|
+
|
28
33
|
_ENABLED_CLOUDS_KEY = 'enabled_clouds'
|
29
34
|
|
30
35
|
_DB_PATH = os.path.expanduser('~/.sky/state.db')
|
@@ -55,12 +60,15 @@ def create_table(cursor, conn):
|
|
55
60
|
last_use TEXT,
|
56
61
|
status TEXT,
|
57
62
|
autostop INTEGER DEFAULT -1,
|
58
|
-
metadata TEXT DEFAULT
|
63
|
+
metadata TEXT DEFAULT '{}',
|
59
64
|
to_down INTEGER DEFAULT 0,
|
60
65
|
owner TEXT DEFAULT null,
|
61
66
|
cluster_hash TEXT DEFAULT null,
|
62
67
|
storage_mounts_metadata BLOB DEFAULT null,
|
63
|
-
cluster_ever_up INTEGER DEFAULT 0
|
68
|
+
cluster_ever_up INTEGER DEFAULT 0,
|
69
|
+
status_updated_at INTEGER DEFAULT null,
|
70
|
+
config_hash TEXT DEFAULT null,
|
71
|
+
user_hash TEXT DEFAULT null)""")
|
64
72
|
|
65
73
|
# Table for Cluster History
|
66
74
|
# usage_intervals: List[Tuple[int, int]]
|
@@ -83,7 +91,8 @@ def create_table(cursor, conn):
|
|
83
91
|
num_nodes int,
|
84
92
|
requested_resources BLOB,
|
85
93
|
launched_resources BLOB,
|
86
|
-
usage_intervals BLOB
|
94
|
+
usage_intervals BLOB,
|
95
|
+
user_hash TEXT)""")
|
87
96
|
# Table for configs (e.g. enabled clouds)
|
88
97
|
cursor.execute("""\
|
89
98
|
CREATE TABLE IF NOT EXISTS config (
|
@@ -96,6 +105,11 @@ def create_table(cursor, conn):
|
|
96
105
|
handle BLOB,
|
97
106
|
last_use TEXT,
|
98
107
|
status TEXT)""")
|
108
|
+
# Table for User
|
109
|
+
cursor.execute("""\
|
110
|
+
CREATE TABLE IF NOT EXISTS users (
|
111
|
+
id TEXT PRIMARY KEY,
|
112
|
+
name TEXT)""")
|
99
113
|
# For backward compatibility.
|
100
114
|
# TODO(zhwu): Remove this function after all users have migrated to
|
101
115
|
# the latest version of SkyPilot.
|
@@ -104,11 +118,12 @@ def create_table(cursor, conn):
|
|
104
118
|
'INTEGER DEFAULT -1')
|
105
119
|
|
106
120
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'metadata',
|
107
|
-
'TEXT DEFAULT
|
121
|
+
'TEXT DEFAULT \'{}\'')
|
108
122
|
|
109
123
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'to_down',
|
110
124
|
'INTEGER DEFAULT 0')
|
111
125
|
|
126
|
+
# The cloud identity that created the cluster.
|
112
127
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'owner', 'TEXT')
|
113
128
|
|
114
129
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'cluster_hash',
|
@@ -130,17 +145,52 @@ def create_table(cursor, conn):
|
|
130
145
|
# clusters were never really UP, setting it to 1 means they won't be
|
131
146
|
# auto-deleted during any failover.
|
132
147
|
value_to_replace_existing_entries=1)
|
148
|
+
db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
|
149
|
+
'INTEGER DEFAULT null')
|
150
|
+
db_utils.add_column_to_table(
|
151
|
+
cursor,
|
152
|
+
conn,
|
153
|
+
'clusters',
|
154
|
+
'user_hash',
|
155
|
+
'TEXT DEFAULT null',
|
156
|
+
value_to_replace_existing_entries=common_utils.get_user_hash())
|
157
|
+
db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
|
158
|
+
'TEXT DEFAULT null')
|
159
|
+
|
160
|
+
db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
|
161
|
+
'TEXT DEFAULT null')
|
162
|
+
|
163
|
+
db_utils.add_column_to_table(cursor, conn, 'cluster_history', 'user_hash',
|
164
|
+
'TEXT DEFAULT null')
|
133
165
|
conn.commit()
|
134
166
|
|
135
167
|
|
136
168
|
_DB = db_utils.SQLiteConn(_DB_PATH, create_table)
|
137
169
|
|
138
170
|
|
171
|
+
def add_or_update_user(user: models.User):
|
172
|
+
"""Store the mapping from user hash to user name for display purposes."""
|
173
|
+
if user.name is None:
|
174
|
+
return
|
175
|
+
_DB.cursor.execute('INSERT OR REPLACE INTO users (id, name) VALUES (?, ?)',
|
176
|
+
(user.id, user.name))
|
177
|
+
_DB.conn.commit()
|
178
|
+
|
179
|
+
|
180
|
+
def get_user(user_id: str) -> models.User:
|
181
|
+
row = _DB.cursor.execute('SELECT id, name FROM users WHERE id=?',
|
182
|
+
(user_id,)).fetchone()
|
183
|
+
if row is None:
|
184
|
+
return models.User(id=user_id)
|
185
|
+
return models.User(id=row[0], name=row[1])
|
186
|
+
|
187
|
+
|
139
188
|
def add_or_update_cluster(cluster_name: str,
|
140
189
|
cluster_handle: 'backends.ResourceHandle',
|
141
190
|
requested_resources: Optional[Set[Any]],
|
142
191
|
ready: bool,
|
143
|
-
is_launch: bool = True
|
192
|
+
is_launch: bool = True,
|
193
|
+
config_hash: Optional[str] = None):
|
144
194
|
"""Adds or updates cluster_name -> cluster_handle mapping.
|
145
195
|
|
146
196
|
Args:
|
@@ -155,10 +205,11 @@ def add_or_update_cluster(cluster_name: str,
|
|
155
205
|
# FIXME: launched_at will be changed when `sky launch -c` is called.
|
156
206
|
handle = pickle.dumps(cluster_handle)
|
157
207
|
cluster_launched_at = int(time.time()) if is_launch else None
|
158
|
-
last_use = common_utils.
|
208
|
+
last_use = common_utils.get_current_command() if is_launch else None
|
159
209
|
status = status_lib.ClusterStatus.INIT
|
160
210
|
if ready:
|
161
211
|
status = status_lib.ClusterStatus.UP
|
212
|
+
status_updated_at = int(time.time())
|
162
213
|
|
163
214
|
# TODO (sumanth): Cluster history table will have multiple entries
|
164
215
|
# when the cluster failover through multiple regions (one entry per region).
|
@@ -183,6 +234,8 @@ def add_or_update_cluster(cluster_name: str,
|
|
183
234
|
cluster_launched_at = int(time.time())
|
184
235
|
usage_intervals.append((cluster_launched_at, None))
|
185
236
|
|
237
|
+
user_hash = common_utils.get_user_hash()
|
238
|
+
|
186
239
|
_DB.cursor.execute(
|
187
240
|
'INSERT or REPLACE INTO clusters'
|
188
241
|
# All the fields need to exist here, even if they don't need
|
@@ -191,7 +244,8 @@ def add_or_update_cluster(cluster_name: str,
|
|
191
244
|
# specified.
|
192
245
|
'(name, launched_at, handle, last_use, status, '
|
193
246
|
'autostop, to_down, metadata, owner, cluster_hash, '
|
194
|
-
'storage_mounts_metadata, cluster_ever_up
|
247
|
+
'storage_mounts_metadata, cluster_ever_up, status_updated_at, '
|
248
|
+
'config_hash, user_hash) '
|
195
249
|
'VALUES ('
|
196
250
|
# name
|
197
251
|
'?, '
|
@@ -217,7 +271,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
217
271
|
# Keep the old metadata value if it exists, otherwise set it to
|
218
272
|
# default {}.
|
219
273
|
'COALESCE('
|
220
|
-
'(SELECT metadata FROM clusters WHERE name=?),
|
274
|
+
'(SELECT metadata FROM clusters WHERE name=?), \'{}\'),'
|
221
275
|
# Keep the old owner value if it exists, otherwise set it to
|
222
276
|
# default null.
|
223
277
|
'COALESCE('
|
@@ -228,7 +282,14 @@ def add_or_update_cluster(cluster_name: str,
|
|
228
282
|
'COALESCE('
|
229
283
|
'(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
|
230
284
|
# cluster_ever_up
|
231
|
-
'((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?)'
|
285
|
+
'((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?), '
|
286
|
+
# status_updated_at
|
287
|
+
'?,'
|
288
|
+
# config_hash
|
289
|
+
'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?)),'
|
290
|
+
# user_hash: keep original user_hash if it exists
|
291
|
+
'COALESCE('
|
292
|
+
'(SELECT user_hash FROM clusters WHERE name=?), ?)'
|
232
293
|
')',
|
233
294
|
(
|
234
295
|
# name
|
@@ -260,6 +321,14 @@ def add_or_update_cluster(cluster_name: str,
|
|
260
321
|
# cluster_ever_up
|
261
322
|
cluster_name,
|
262
323
|
int(ready),
|
324
|
+
# status_updated_at
|
325
|
+
status_updated_at,
|
326
|
+
# config_hash
|
327
|
+
config_hash,
|
328
|
+
cluster_name,
|
329
|
+
# user_hash
|
330
|
+
cluster_name,
|
331
|
+
user_hash,
|
263
332
|
))
|
264
333
|
|
265
334
|
launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
|
@@ -267,7 +336,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
267
336
|
_DB.cursor.execute(
|
268
337
|
'INSERT or REPLACE INTO cluster_history'
|
269
338
|
'(cluster_hash, name, num_nodes, requested_resources, '
|
270
|
-
'launched_resources, usage_intervals) '
|
339
|
+
'launched_resources, usage_intervals, user_hash) '
|
271
340
|
'VALUES ('
|
272
341
|
# hash
|
273
342
|
'?, '
|
@@ -280,7 +349,10 @@ def add_or_update_cluster(cluster_name: str,
|
|
280
349
|
# number of nodes
|
281
350
|
'?, '
|
282
351
|
# usage intervals
|
283
|
-
'
|
352
|
+
'?, '
|
353
|
+
# user_hash
|
354
|
+
'?'
|
355
|
+
')',
|
284
356
|
(
|
285
357
|
# hash
|
286
358
|
cluster_hash,
|
@@ -294,15 +366,37 @@ def add_or_update_cluster(cluster_name: str,
|
|
294
366
|
pickle.dumps(launched_resources),
|
295
367
|
# usage intervals
|
296
368
|
pickle.dumps(usage_intervals),
|
369
|
+
# user_hash
|
370
|
+
user_hash,
|
297
371
|
))
|
298
372
|
|
299
373
|
_DB.conn.commit()
|
300
374
|
|
301
375
|
|
376
|
+
def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
|
377
|
+
"""Returns the user hash or the current user hash, if user_hash is None.
|
378
|
+
|
379
|
+
This is to ensure that the clusters created before the client-server
|
380
|
+
architecture (no user hash info previously) are associated with the current
|
381
|
+
user.
|
382
|
+
"""
|
383
|
+
if user_hash is not None:
|
384
|
+
return user_hash
|
385
|
+
return common_utils.get_user_hash()
|
386
|
+
|
387
|
+
|
388
|
+
def update_cluster_handle(cluster_name: str,
|
389
|
+
cluster_handle: 'backends.ResourceHandle'):
|
390
|
+
handle = pickle.dumps(cluster_handle)
|
391
|
+
_DB.cursor.execute('UPDATE clusters SET handle=(?) WHERE name=(?)',
|
392
|
+
(handle, cluster_name))
|
393
|
+
_DB.conn.commit()
|
394
|
+
|
395
|
+
|
302
396
|
def update_last_use(cluster_name: str):
|
303
397
|
"""Updates the last used command for the cluster."""
|
304
398
|
_DB.cursor.execute('UPDATE clusters SET last_use=(?) WHERE name=(?)',
|
305
|
-
(common_utils.
|
399
|
+
(common_utils.get_current_command(), cluster_name))
|
306
400
|
_DB.conn.commit()
|
307
401
|
|
308
402
|
|
@@ -330,11 +424,13 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
|
|
330
424
|
# stopped VM, which leads to timeout.
|
331
425
|
if hasattr(handle, 'stable_internal_external_ips'):
|
332
426
|
handle.stable_internal_external_ips = None
|
427
|
+
current_time = int(time.time())
|
333
428
|
_DB.cursor.execute(
|
334
|
-
'UPDATE clusters SET handle=(?), status=(?) '
|
335
|
-
'WHERE name=(?)', (
|
429
|
+
'UPDATE clusters SET handle=(?), status=(?), '
|
430
|
+
'status_updated_at=(?) WHERE name=(?)', (
|
336
431
|
pickle.dumps(handle),
|
337
432
|
status_lib.ClusterStatus.STOPPED.value,
|
433
|
+
current_time,
|
338
434
|
cluster_name,
|
339
435
|
))
|
340
436
|
_DB.conn.commit()
|
@@ -359,10 +455,10 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]:
|
|
359
455
|
|
360
456
|
def set_cluster_status(cluster_name: str,
|
361
457
|
status: status_lib.ClusterStatus) -> None:
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
458
|
+
current_time = int(time.time())
|
459
|
+
_DB.cursor.execute(
|
460
|
+
'UPDATE clusters SET status=(?), status_updated_at=(?) WHERE name=(?)',
|
461
|
+
(status.value, current_time, cluster_name))
|
366
462
|
count = _DB.cursor.rowcount
|
367
463
|
_DB.conn.commit()
|
368
464
|
assert count <= 1, count
|
@@ -570,15 +666,19 @@ def _load_storage_mounts_metadata(
|
|
570
666
|
|
571
667
|
def get_cluster_from_name(
|
572
668
|
cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
|
573
|
-
rows = _DB.cursor.execute(
|
574
|
-
|
669
|
+
rows = _DB.cursor.execute(
|
670
|
+
'SELECT name, launched_at, handle, last_use, status, autostop, '
|
671
|
+
'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
|
672
|
+
'cluster_ever_up, status_updated_at, config_hash, user_hash '
|
673
|
+
'FROM clusters WHERE name=(?)', (cluster_name,)).fetchall()
|
575
674
|
for row in rows:
|
576
675
|
# Explicitly specify the number of fields to unpack, so that
|
577
676
|
# we can add new fields to the database in the future without
|
578
677
|
# breaking the previous code.
|
579
678
|
(name, launched_at, handle, last_use, status, autostop, metadata,
|
580
|
-
to_down, owner, cluster_hash, storage_mounts_metadata,
|
581
|
-
|
679
|
+
to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
|
680
|
+
status_updated_at, config_hash, user_hash) = row
|
681
|
+
user_hash = _get_user_hash_or_current_user(user_hash)
|
582
682
|
# TODO: use namedtuple instead of dict
|
583
683
|
record = {
|
584
684
|
'name': name,
|
@@ -594,6 +694,10 @@ def get_cluster_from_name(
|
|
594
694
|
'storage_mounts_metadata':
|
595
695
|
_load_storage_mounts_metadata(storage_mounts_metadata),
|
596
696
|
'cluster_ever_up': bool(cluster_ever_up),
|
697
|
+
'status_updated_at': status_updated_at,
|
698
|
+
'user_hash': user_hash,
|
699
|
+
'user_name': get_user(user_hash).name,
|
700
|
+
'config_hash': config_hash,
|
597
701
|
}
|
598
702
|
return record
|
599
703
|
return None
|
@@ -601,12 +705,16 @@ def get_cluster_from_name(
|
|
601
705
|
|
602
706
|
def get_clusters() -> List[Dict[str, Any]]:
|
603
707
|
rows = _DB.cursor.execute(
|
604
|
-
'select
|
708
|
+
'select name, launched_at, handle, last_use, status, autostop, '
|
709
|
+
'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
|
710
|
+
'cluster_ever_up, status_updated_at, config_hash, user_hash '
|
711
|
+
'from clusters order by launched_at desc').fetchall()
|
605
712
|
records = []
|
606
713
|
for row in rows:
|
607
714
|
(name, launched_at, handle, last_use, status, autostop, metadata,
|
608
|
-
to_down, owner, cluster_hash, storage_mounts_metadata,
|
609
|
-
|
715
|
+
to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
|
716
|
+
status_updated_at, config_hash, user_hash) = row
|
717
|
+
user_hash = _get_user_hash_or_current_user(user_hash)
|
610
718
|
# TODO: use namedtuple instead of dict
|
611
719
|
record = {
|
612
720
|
'name': name,
|
@@ -622,6 +730,10 @@ def get_clusters() -> List[Dict[str, Any]]:
|
|
622
730
|
'storage_mounts_metadata':
|
623
731
|
_load_storage_mounts_metadata(storage_mounts_metadata),
|
624
732
|
'cluster_ever_up': bool(cluster_ever_up),
|
733
|
+
'status_updated_at': status_updated_at,
|
734
|
+
'user_hash': user_hash,
|
735
|
+
'user_name': get_user(user_hash).name,
|
736
|
+
'config_hash': config_hash,
|
625
737
|
}
|
626
738
|
|
627
739
|
records.append(record)
|
@@ -631,7 +743,8 @@ def get_clusters() -> List[Dict[str, Any]]:
|
|
631
743
|
def get_clusters_from_history() -> List[Dict[str, Any]]:
|
632
744
|
rows = _DB.cursor.execute(
|
633
745
|
'SELECT ch.cluster_hash, ch.name, ch.num_nodes, '
|
634
|
-
'ch.launched_resources, ch.usage_intervals, clusters.status
|
746
|
+
'ch.launched_resources, ch.usage_intervals, clusters.status, '
|
747
|
+
'ch.user_hash '
|
635
748
|
'FROM cluster_history ch '
|
636
749
|
'LEFT OUTER JOIN clusters '
|
637
750
|
'ON ch.cluster_hash=clusters.cluster_hash ').fetchall()
|
@@ -650,7 +763,9 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
|
|
650
763
|
launched_resources,
|
651
764
|
usage_intervals,
|
652
765
|
status,
|
653
|
-
|
766
|
+
user_hash,
|
767
|
+
) = row[:7]
|
768
|
+
user_hash = _get_user_hash_or_current_user(user_hash)
|
654
769
|
|
655
770
|
if status is not None:
|
656
771
|
status = status_lib.ClusterStatus[status]
|
@@ -664,6 +779,7 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
|
|
664
779
|
'cluster_hash': cluster_hash,
|
665
780
|
'usage_intervals': pickle.loads(usage_intervals),
|
666
781
|
'status': status,
|
782
|
+
'user_hash': user_hash,
|
667
783
|
}
|
668
784
|
|
669
785
|
records.append(record)
|
@@ -679,17 +795,17 @@ def get_cluster_names_start_with(starts_with: str) -> List[str]:
|
|
679
795
|
return [row[0] for row in rows]
|
680
796
|
|
681
797
|
|
682
|
-
def get_cached_enabled_clouds() -> List[clouds.Cloud]:
|
798
|
+
def get_cached_enabled_clouds() -> List['clouds.Cloud']:
|
683
799
|
rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?',
|
684
800
|
(_ENABLED_CLOUDS_KEY,))
|
685
801
|
ret = []
|
686
802
|
for (value,) in rows:
|
687
803
|
ret = json.loads(value)
|
688
804
|
break
|
689
|
-
enabled_clouds: List[clouds.Cloud] = []
|
805
|
+
enabled_clouds: List['clouds.Cloud'] = []
|
690
806
|
for c in ret:
|
691
807
|
try:
|
692
|
-
cloud =
|
808
|
+
cloud = registry.CLOUD_REGISTRY.from_str(c)
|
693
809
|
except ValueError:
|
694
810
|
# Handle the case for the clouds whose support has been removed from
|
695
811
|
# SkyPilot, e.g., 'local' was a cloud in the past and may be stored
|
@@ -712,7 +828,7 @@ def add_or_update_storage(storage_name: str,
|
|
712
828
|
storage_status: status_lib.StorageStatus):
|
713
829
|
storage_launched_at = int(time.time())
|
714
830
|
handle = pickle.dumps(storage_handle)
|
715
|
-
last_use = common_utils.
|
831
|
+
last_use = common_utils.get_current_command()
|
716
832
|
|
717
833
|
def status_check(status):
|
718
834
|
return status in status_lib.StorageStatus
|
@@ -794,7 +910,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:
|
|
794
910
|
|
795
911
|
|
796
912
|
def get_storage() -> List[Dict[str, Any]]:
|
797
|
-
rows = _DB.cursor.execute('
|
913
|
+
rows = _DB.cursor.execute('SELECT * FROM storage')
|
798
914
|
records = []
|
799
915
|
for name, launched_at, handle, last_use, status in rows:
|
800
916
|
# TODO: use namedtuple instead of dict
|
sky/jobs/__init__.py
CHANGED
@@ -1,33 +1,32 @@
|
|
1
1
|
"""Managed jobs."""
|
2
2
|
import pathlib
|
3
3
|
|
4
|
+
from sky.jobs.client.sdk import cancel
|
5
|
+
from sky.jobs.client.sdk import dashboard
|
6
|
+
from sky.jobs.client.sdk import download_logs
|
7
|
+
from sky.jobs.client.sdk import launch
|
8
|
+
from sky.jobs.client.sdk import queue
|
9
|
+
from sky.jobs.client.sdk import tail_logs
|
4
10
|
from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
|
11
|
+
from sky.jobs.constants import JOBS_CONTROLLER_LOGS_DIR
|
5
12
|
from sky.jobs.constants import JOBS_CONTROLLER_TEMPLATE
|
6
13
|
from sky.jobs.constants import JOBS_CONTROLLER_YAML_PREFIX
|
7
14
|
from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
|
8
|
-
from sky.jobs.
|
9
|
-
from sky.jobs.core import launch
|
10
|
-
from sky.jobs.core import queue
|
11
|
-
from sky.jobs.core import tail_logs
|
12
|
-
from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
|
13
|
-
from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
|
15
|
+
from sky.jobs.recovery_strategy import StrategyExecutor
|
14
16
|
from sky.jobs.state import ManagedJobStatus
|
15
17
|
from sky.jobs.utils import dump_managed_job_queue
|
16
18
|
from sky.jobs.utils import format_job_table
|
17
|
-
from sky.jobs.utils import JOB_CONTROLLER_NAME
|
18
19
|
from sky.jobs.utils import load_managed_job_queue
|
19
20
|
from sky.jobs.utils import ManagedJobCodeGen
|
20
21
|
|
21
22
|
pathlib.Path(JOBS_TASK_YAML_PREFIX).expanduser().parent.mkdir(parents=True,
|
22
23
|
exist_ok=True)
|
23
24
|
__all__ = [
|
24
|
-
'RECOVERY_STRATEGIES',
|
25
|
-
'DEFAULT_RECOVERY_STRATEGY',
|
26
|
-
'JOB_CONTROLLER_NAME',
|
27
25
|
# Constants
|
28
26
|
'JOBS_CONTROLLER_TEMPLATE',
|
29
27
|
'JOBS_CONTROLLER_YAML_PREFIX',
|
30
28
|
'JOBS_TASK_YAML_PREFIX',
|
29
|
+
'JOBS_CONTROLLER_LOGS_DIR',
|
31
30
|
# Enums
|
32
31
|
'ManagedJobStatus',
|
33
32
|
# Core
|
@@ -35,9 +34,12 @@ __all__ = [
|
|
35
34
|
'launch',
|
36
35
|
'queue',
|
37
36
|
'tail_logs',
|
37
|
+
'dashboard',
|
38
|
+
'download_logs',
|
38
39
|
# utils
|
39
40
|
'ManagedJobCodeGen',
|
40
41
|
'format_job_table',
|
41
42
|
'dump_managed_job_queue',
|
42
43
|
'load_managed_job_queue',
|
44
|
+
'StrategyExecutor',
|
43
45
|
]
|
File without changes
|