skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +48 -22
- sky/adaptors/aws.py +2 -1
- sky/adaptors/azure.py +4 -4
- sky/adaptors/cloudflare.py +4 -4
- sky/adaptors/kubernetes.py +8 -8
- sky/authentication.py +42 -45
- sky/backends/backend.py +2 -2
- sky/backends/backend_utils.py +108 -221
- sky/backends/cloud_vm_ray_backend.py +283 -282
- sky/benchmark/benchmark_utils.py +6 -2
- sky/check.py +40 -28
- sky/cli.py +1213 -1116
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5644 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1757 -0
- sky/cloud_stores.py +12 -6
- sky/clouds/__init__.py +0 -2
- sky/clouds/aws.py +20 -13
- sky/clouds/azure.py +5 -3
- sky/clouds/cloud.py +1 -1
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +3 -2
- sky/clouds/gcp.py +10 -8
- sky/clouds/ibm.py +8 -7
- sky/clouds/kubernetes.py +7 -6
- sky/clouds/lambda_cloud.py +8 -7
- sky/clouds/oci.py +4 -3
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +8 -7
- sky/clouds/service_catalog/__init__.py +3 -3
- sky/clouds/service_catalog/aws_catalog.py +7 -1
- sky/clouds/service_catalog/common.py +4 -2
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
- sky/clouds/utils/oci_utils.py +1 -1
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +263 -99
- sky/dag.py +4 -0
- sky/data/mounting_utils.py +2 -1
- sky/data/storage.py +97 -35
- sky/data/storage_utils.py +69 -9
- sky/exceptions.py +138 -5
- sky/execution.py +47 -50
- sky/global_user_state.py +105 -22
- sky/jobs/__init__.py +12 -14
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +296 -0
- sky/jobs/constants.py +30 -1
- sky/jobs/controller.py +12 -6
- sky/jobs/dashboard/dashboard.py +2 -6
- sky/jobs/recovery_strategy.py +22 -29
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/{core.py → server/core.py} +101 -34
- sky/jobs/server/dashboard_utils.py +64 -0
- sky/jobs/server/server.py +182 -0
- sky/jobs/utils.py +32 -23
- sky/models.py +27 -0
- sky/optimizer.py +9 -11
- sky/provision/__init__.py +6 -3
- sky/provision/aws/config.py +2 -2
- sky/provision/aws/instance.py +1 -1
- sky/provision/azure/instance.py +1 -1
- sky/provision/cudo/instance.py +1 -1
- sky/provision/do/instance.py +1 -1
- sky/provision/do/utils.py +0 -5
- sky/provision/fluidstack/fluidstack_utils.py +4 -3
- sky/provision/fluidstack/instance.py +4 -2
- sky/provision/gcp/instance.py +1 -1
- sky/provision/instance_setup.py +2 -2
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +67 -76
- sky/provision/lambda_cloud/instance.py +3 -15
- sky/provision/logging.py +1 -1
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +1 -1
- sky/provision/provisioner.py +3 -2
- sky/provision/runpod/instance.py +1 -1
- sky/provision/vast/instance.py +1 -1
- sky/provision/vast/utils.py +2 -1
- sky/provision/vsphere/instance.py +2 -11
- sky/resources.py +55 -40
- sky/serve/__init__.py +6 -10
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +3 -0
- sky/serve/replica_managers.py +10 -10
- sky/serve/serve_utils.py +56 -36
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +37 -17
- sky/serve/server/server.py +117 -0
- sky/serve/service.py +8 -1
- sky/server/__init__.py +1 -0
- sky/server/common.py +441 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +462 -0
- sky/server/requests/payloads.py +481 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1095 -0
- sky/server/stream_utils.py +144 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +12 -4
- sky/setup_files/setup.py +1 -1
- sky/sky_logging.py +9 -13
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +46 -12
- sky/skylet/events.py +5 -6
- sky/skylet/job_lib.py +78 -66
- sky/skylet/log_lib.py +17 -11
- sky/skypilot_config.py +79 -94
- sky/task.py +119 -73
- sky/templates/aws-ray.yml.j2 +4 -4
- sky/templates/azure-ray.yml.j2 +3 -2
- sky/templates/cudo-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +3 -2
- sky/templates/gcp-ray.yml.j2 +3 -2
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +1 -12
- sky/templates/kubernetes-ray.yml.j2 +3 -2
- sky/templates/lambda-ray.yml.j2 +3 -2
- sky/templates/oci-ray.yml.j2 +3 -2
- sky/templates/paperspace-ray.yml.j2 +3 -2
- sky/templates/runpod-ray.yml.j2 +3 -2
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vsphere-ray.yml.j2 +4 -2
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +8 -0
- sky/usage/usage_lib.py +45 -11
- sky/utils/accelerator_registry.py +33 -53
- sky/utils/admin_policy_utils.py +2 -1
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +33 -3
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +69 -14
- sky/utils/common.py +74 -0
- sky/utils/common_utils.py +133 -93
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +2 -3
- sky/utils/controller_utils.py +133 -147
- sky/utils/dag_utils.py +72 -24
- sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/log_utils.py +83 -23
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +2 -2
- sky/utils/rich_utils.py +213 -34
- sky/utils/schemas.py +19 -2
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +51 -35
- sky/utils/timeline.py +7 -2
- sky/utils/ux_utils.py +95 -25
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
- sky/clouds/cloud_registry.py +0 -76
- sky/utils/cluster_yaml_utils.py +0 -24
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/execution.py
CHANGED
@@ -3,28 +3,32 @@
|
|
3
3
|
See `Stage` for a Task's life cycle.
|
4
4
|
"""
|
5
5
|
import enum
|
6
|
+
import typing
|
6
7
|
from typing import List, Optional, Tuple, Union
|
7
8
|
|
8
9
|
import colorama
|
9
10
|
|
10
|
-
import sky
|
11
11
|
from sky import admin_policy
|
12
12
|
from sky import backends
|
13
13
|
from sky import clouds
|
14
14
|
from sky import global_user_state
|
15
15
|
from sky import optimizer
|
16
16
|
from sky import sky_logging
|
17
|
-
from sky import status_lib
|
18
17
|
from sky.backends import backend_utils
|
19
18
|
from sky.usage import usage_lib
|
20
19
|
from sky.utils import admin_policy_utils
|
20
|
+
from sky.utils import common
|
21
21
|
from sky.utils import controller_utils
|
22
22
|
from sky.utils import dag_utils
|
23
23
|
from sky.utils import resources_utils
|
24
24
|
from sky.utils import rich_utils
|
25
|
+
from sky.utils import status_lib
|
25
26
|
from sky.utils import timeline
|
26
27
|
from sky.utils import ux_utils
|
27
28
|
|
29
|
+
if typing.TYPE_CHECKING:
|
30
|
+
import sky
|
31
|
+
|
28
32
|
logger = sky_logging.init_logger(__name__)
|
29
33
|
|
30
34
|
|
@@ -100,7 +104,7 @@ def _execute(
|
|
100
104
|
handle: Optional[backends.ResourceHandle] = None,
|
101
105
|
backend: Optional[backends.Backend] = None,
|
102
106
|
retry_until_up: bool = False,
|
103
|
-
optimize_target:
|
107
|
+
optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
|
104
108
|
stages: Optional[List[Stage]] = None,
|
105
109
|
cluster_name: Optional[str] = None,
|
106
110
|
detach_setup: bool = False,
|
@@ -111,6 +115,7 @@ def _execute(
|
|
111
115
|
skip_unnecessary_provisioning: bool = False,
|
112
116
|
# Internal only:
|
113
117
|
# pylint: disable=invalid-name
|
118
|
+
_quiet_optimizer: bool = False,
|
114
119
|
_is_launched_by_jobs_controller: bool = False,
|
115
120
|
_is_launched_by_sky_serve_controller: bool = False,
|
116
121
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
@@ -167,16 +172,19 @@ def _execute(
|
|
167
172
|
"""
|
168
173
|
|
169
174
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
175
|
+
for task in dag.tasks:
|
176
|
+
if task.storage_mounts is not None:
|
177
|
+
for storage in task.storage_mounts.values():
|
178
|
+
# Ensure the storage is constructed.
|
179
|
+
storage.construct()
|
180
|
+
dag, _ = admin_policy_utils.apply(
|
181
|
+
dag,
|
182
|
+
request_options=admin_policy.RequestOptions(
|
183
|
+
cluster_name=cluster_name,
|
184
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
185
|
+
down=down,
|
186
|
+
dryrun=dryrun,
|
187
|
+
))
|
180
188
|
assert len(dag) == 1, f'We support 1 task for now. {dag}'
|
181
189
|
task = dag.tasks[0]
|
182
190
|
|
@@ -274,14 +282,15 @@ def _execute(
|
|
274
282
|
# no-credential machine should not enter optimize(), which
|
275
283
|
# would directly error out ('No cloud is enabled...'). Fix
|
276
284
|
# by moving `sky check` checks out of optimize()?
|
277
|
-
|
278
285
|
controller = controller_utils.Controllers.from_name(
|
279
286
|
cluster_name)
|
280
287
|
if controller is not None:
|
281
288
|
logger.info(
|
282
289
|
f'Choosing resources for {controller.value.name}...'
|
283
290
|
)
|
284
|
-
dag =
|
291
|
+
dag = optimizer.Optimizer.optimize(dag,
|
292
|
+
minimize=optimize_target,
|
293
|
+
quiet=_quiet_optimizer)
|
285
294
|
task = dag.tasks[0] # Keep: dag may have been deep-copied.
|
286
295
|
assert task.best_resources is not None, task
|
287
296
|
|
@@ -320,7 +329,7 @@ def _execute(
|
|
320
329
|
(task.file_mounts is not None or
|
321
330
|
task.storage_mounts is not None))
|
322
331
|
if do_workdir or do_file_mounts:
|
323
|
-
logger.info(ux_utils.starting_message('
|
332
|
+
logger.info(ux_utils.starting_message('Syncing files.'))
|
324
333
|
|
325
334
|
if do_workdir:
|
326
335
|
backend.sync_workdir(handle, task.workdir)
|
@@ -374,20 +383,19 @@ def launch(
|
|
374
383
|
down: bool = False,
|
375
384
|
stream_logs: bool = True,
|
376
385
|
backend: Optional[backends.Backend] = None,
|
377
|
-
optimize_target:
|
378
|
-
detach_setup: bool = False,
|
379
|
-
detach_run: bool = False,
|
386
|
+
optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
|
380
387
|
no_setup: bool = False,
|
381
388
|
clone_disk_from: Optional[str] = None,
|
382
389
|
fast: bool = False,
|
383
390
|
# Internal only:
|
384
391
|
# pylint: disable=invalid-name
|
392
|
+
_quiet_optimizer: bool = False,
|
385
393
|
_is_launched_by_jobs_controller: bool = False,
|
386
394
|
_is_launched_by_sky_serve_controller: bool = False,
|
387
395
|
_disable_controller_check: bool = False,
|
388
396
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
389
397
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
390
|
-
"""
|
398
|
+
"""Launches a cluster or task.
|
391
399
|
|
392
400
|
The task's setup and run commands are executed under the task's workdir
|
393
401
|
(when specified, it is synced to remote cluster). The task undergoes job
|
@@ -397,6 +405,16 @@ def launch(
|
|
397
405
|
usage) a sky.Dag. In the latter case, currently it must contain a single
|
398
406
|
task; support for pipelines/general DAGs are in experimental branches.
|
399
407
|
|
408
|
+
Example:
|
409
|
+
.. code-block:: python
|
410
|
+
|
411
|
+
import sky
|
412
|
+
task = sky.Task(run='echo hello SkyPilot')
|
413
|
+
task.set_resources(
|
414
|
+
sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
|
415
|
+
sky.launch(task, cluster_name='my-cluster')
|
416
|
+
|
417
|
+
|
400
418
|
Args:
|
401
419
|
task: sky.Task, or sky.Dag (experimental; 1-task only) to launch.
|
402
420
|
cluster_name: name of the cluster to create/reuse. If None,
|
@@ -408,7 +426,7 @@ def launch(
|
|
408
426
|
cluster's job queue. Idleness gets reset whenever setting-up/
|
409
427
|
running/pending jobs are found in the job queue. Setting this
|
410
428
|
flag is equivalent to running
|
411
|
-
``sky.launch(
|
429
|
+
``sky.launch(...)`` and then
|
412
430
|
``sky.autostop(idle_minutes=<minutes>)``. If not set, the cluster
|
413
431
|
will not be autostopped.
|
414
432
|
down: Tear down the cluster after all jobs finish (successfully or
|
@@ -422,14 +440,6 @@ def launch(
|
|
422
440
|
(CloudVMRayBackend).
|
423
441
|
optimize_target: target to optimize for. Choices: OptimizeTarget.COST,
|
424
442
|
OptimizeTarget.TIME.
|
425
|
-
detach_setup: If True, run setup in non-interactive mode as part of the
|
426
|
-
job itself. You can safely ctrl-c to detach from logging, and it
|
427
|
-
will not interrupt the setup process. To see the logs again after
|
428
|
-
detaching, use `sky logs`. To cancel setup, cancel the job via
|
429
|
-
`sky cancel`. Useful for long-running setup
|
430
|
-
commands.
|
431
|
-
detach_run: If True, as soon as a job is submitted, return from this
|
432
|
-
function and do not stream execution logs.
|
433
443
|
no_setup: if True, do not re-run setup commands.
|
434
444
|
clone_disk_from: [Experimental] if set, clone the disk from the
|
435
445
|
specified cluster. This is useful to migrate the cluster to a
|
@@ -437,15 +447,6 @@ def launch(
|
|
437
447
|
fast: [Experimental] If the cluster is already up and available,
|
438
448
|
skip provisioning and setup steps.
|
439
449
|
|
440
|
-
Example:
|
441
|
-
.. code-block:: python
|
442
|
-
|
443
|
-
import sky
|
444
|
-
task = sky.Task(run='echo hello SkyPilot')
|
445
|
-
task.set_resources(
|
446
|
-
sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
|
447
|
-
sky.launch(task, cluster_name='my-cluster')
|
448
|
-
|
449
450
|
Raises:
|
450
451
|
exceptions.ClusterOwnerIdentityMismatchError: if the cluster is
|
451
452
|
owned by another user.
|
@@ -474,7 +475,9 @@ def launch(
|
|
474
475
|
handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
|
475
476
|
if dryrun.
|
476
477
|
"""
|
478
|
+
|
477
479
|
entrypoint = task
|
480
|
+
entrypoint.validate()
|
478
481
|
if not _disable_controller_check:
|
479
482
|
controller_utils.check_cluster_name_not_controller(
|
480
483
|
cluster_name, operation_str='sky.launch')
|
@@ -537,12 +540,13 @@ def launch(
|
|
537
540
|
optimize_target=optimize_target,
|
538
541
|
stages=stages,
|
539
542
|
cluster_name=cluster_name,
|
540
|
-
detach_setup=
|
541
|
-
detach_run=
|
543
|
+
detach_setup=True,
|
544
|
+
detach_run=True,
|
542
545
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
543
546
|
no_setup=no_setup,
|
544
547
|
clone_disk_from=clone_disk_from,
|
545
548
|
skip_unnecessary_provisioning=skip_unnecessary_provisioning,
|
549
|
+
_quiet_optimizer=_quiet_optimizer,
|
546
550
|
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
547
551
|
_is_launched_by_sky_serve_controller=
|
548
552
|
_is_launched_by_sky_serve_controller,
|
@@ -557,10 +561,9 @@ def exec( # pylint: disable=redefined-builtin
|
|
557
561
|
down: bool = False,
|
558
562
|
stream_logs: bool = True,
|
559
563
|
backend: Optional[backends.Backend] = None,
|
560
|
-
detach_run: bool = False,
|
561
564
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
562
565
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
563
|
-
"""
|
566
|
+
"""Executes a task on an existing cluster.
|
564
567
|
|
565
568
|
This function performs two actions:
|
566
569
|
|
@@ -595,8 +598,6 @@ def exec( # pylint: disable=redefined-builtin
|
|
595
598
|
stream_logs: if True, show the logs in the terminal.
|
596
599
|
backend: backend to use. If None, use the default backend
|
597
600
|
(CloudVMRayBackend).
|
598
|
-
detach_run: if True, detach from logging once the task has been
|
599
|
-
submitted.
|
600
601
|
|
601
602
|
Raises:
|
602
603
|
ValueError: if the specified cluster is not in UP status.
|
@@ -613,11 +614,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
613
614
|
if dryrun.
|
614
615
|
"""
|
615
616
|
entrypoint = task
|
616
|
-
|
617
|
-
logger.warning(
|
618
|
-
f'{colorama.Fore.YELLOW}Passing a sky.Dag to sky.exec() is '
|
619
|
-
'deprecated. Pass sky.Task instead.'
|
620
|
-
f'{colorama.Style.RESET_ALL}')
|
617
|
+
entrypoint.validate(workdir_only=True)
|
621
618
|
controller_utils.check_cluster_name_not_controller(cluster_name,
|
622
619
|
operation_str='sky.exec')
|
623
620
|
|
@@ -638,5 +635,5 @@ def exec( # pylint: disable=redefined-builtin
|
|
638
635
|
Stage.EXEC,
|
639
636
|
],
|
640
637
|
cluster_name=cluster_name,
|
641
|
-
detach_run=
|
638
|
+
detach_run=True,
|
642
639
|
)
|
sky/global_user_state.py
CHANGED
@@ -16,15 +16,20 @@ import typing
|
|
16
16
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
17
17
|
import uuid
|
18
18
|
|
19
|
-
from sky import
|
20
|
-
from sky import
|
19
|
+
from sky import models
|
20
|
+
from sky import sky_logging
|
21
21
|
from sky.utils import common_utils
|
22
22
|
from sky.utils import db_utils
|
23
|
+
from sky.utils import registry
|
24
|
+
from sky.utils import status_lib
|
23
25
|
|
24
26
|
if typing.TYPE_CHECKING:
|
25
27
|
from sky import backends
|
28
|
+
from sky import clouds
|
26
29
|
from sky.data import Storage
|
27
30
|
|
31
|
+
logger = sky_logging.init_logger(__name__)
|
32
|
+
|
28
33
|
_ENABLED_CLOUDS_KEY = 'enabled_clouds'
|
29
34
|
|
30
35
|
_DB_PATH = os.path.expanduser('~/.sky/state.db')
|
@@ -62,7 +67,8 @@ def create_table(cursor, conn):
|
|
62
67
|
storage_mounts_metadata BLOB DEFAULT null,
|
63
68
|
cluster_ever_up INTEGER DEFAULT 0,
|
64
69
|
status_updated_at INTEGER DEFAULT null,
|
65
|
-
config_hash TEXT DEFAULT null
|
70
|
+
config_hash TEXT DEFAULT null,
|
71
|
+
user_hash TEXT DEFAULT null)""")
|
66
72
|
|
67
73
|
# Table for Cluster History
|
68
74
|
# usage_intervals: List[Tuple[int, int]]
|
@@ -85,7 +91,8 @@ def create_table(cursor, conn):
|
|
85
91
|
num_nodes int,
|
86
92
|
requested_resources BLOB,
|
87
93
|
launched_resources BLOB,
|
88
|
-
usage_intervals BLOB
|
94
|
+
usage_intervals BLOB,
|
95
|
+
user_hash TEXT)""")
|
89
96
|
# Table for configs (e.g. enabled clouds)
|
90
97
|
cursor.execute("""\
|
91
98
|
CREATE TABLE IF NOT EXISTS config (
|
@@ -98,6 +105,11 @@ def create_table(cursor, conn):
|
|
98
105
|
handle BLOB,
|
99
106
|
last_use TEXT,
|
100
107
|
status TEXT)""")
|
108
|
+
# Table for User
|
109
|
+
cursor.execute("""\
|
110
|
+
CREATE TABLE IF NOT EXISTS users (
|
111
|
+
id TEXT PRIMARY KEY,
|
112
|
+
name TEXT)""")
|
101
113
|
# For backward compatibility.
|
102
114
|
# TODO(zhwu): Remove this function after all users have migrated to
|
103
115
|
# the latest version of SkyPilot.
|
@@ -111,6 +123,7 @@ def create_table(cursor, conn):
|
|
111
123
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'to_down',
|
112
124
|
'INTEGER DEFAULT 0')
|
113
125
|
|
126
|
+
# The cloud identity that created the cluster.
|
114
127
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'owner', 'TEXT')
|
115
128
|
|
116
129
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'cluster_hash',
|
@@ -132,19 +145,46 @@ def create_table(cursor, conn):
|
|
132
145
|
# clusters were never really UP, setting it to 1 means they won't be
|
133
146
|
# auto-deleted during any failover.
|
134
147
|
value_to_replace_existing_entries=1)
|
135
|
-
|
136
148
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
|
137
149
|
'INTEGER DEFAULT null')
|
150
|
+
db_utils.add_column_to_table(
|
151
|
+
cursor,
|
152
|
+
conn,
|
153
|
+
'clusters',
|
154
|
+
'user_hash',
|
155
|
+
'TEXT DEFAULT null',
|
156
|
+
value_to_replace_existing_entries=common_utils.get_user_hash())
|
157
|
+
db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
|
158
|
+
'TEXT DEFAULT null')
|
138
159
|
|
139
160
|
db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
|
140
161
|
'TEXT DEFAULT null')
|
141
162
|
|
163
|
+
db_utils.add_column_to_table(cursor, conn, 'cluster_history', 'user_hash',
|
164
|
+
'TEXT DEFAULT null')
|
142
165
|
conn.commit()
|
143
166
|
|
144
167
|
|
145
168
|
_DB = db_utils.SQLiteConn(_DB_PATH, create_table)
|
146
169
|
|
147
170
|
|
171
|
+
def add_or_update_user(user: models.User):
|
172
|
+
"""Store the mapping from user hash to user name for display purposes."""
|
173
|
+
if user.name is None:
|
174
|
+
return
|
175
|
+
_DB.cursor.execute('INSERT OR REPLACE INTO users (id, name) VALUES (?, ?)',
|
176
|
+
(user.id, user.name))
|
177
|
+
_DB.conn.commit()
|
178
|
+
|
179
|
+
|
180
|
+
def get_user(user_id: str) -> models.User:
|
181
|
+
row = _DB.cursor.execute('SELECT id, name FROM users WHERE id=?',
|
182
|
+
(user_id,)).fetchone()
|
183
|
+
if row is None:
|
184
|
+
return models.User(id=user_id)
|
185
|
+
return models.User(id=row[0], name=row[1])
|
186
|
+
|
187
|
+
|
148
188
|
def add_or_update_cluster(cluster_name: str,
|
149
189
|
cluster_handle: 'backends.ResourceHandle',
|
150
190
|
requested_resources: Optional[Set[Any]],
|
@@ -165,7 +205,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
165
205
|
# FIXME: launched_at will be changed when `sky launch -c` is called.
|
166
206
|
handle = pickle.dumps(cluster_handle)
|
167
207
|
cluster_launched_at = int(time.time()) if is_launch else None
|
168
|
-
last_use = common_utils.
|
208
|
+
last_use = common_utils.get_current_command() if is_launch else None
|
169
209
|
status = status_lib.ClusterStatus.INIT
|
170
210
|
if ready:
|
171
211
|
status = status_lib.ClusterStatus.UP
|
@@ -194,6 +234,8 @@ def add_or_update_cluster(cluster_name: str,
|
|
194
234
|
cluster_launched_at = int(time.time())
|
195
235
|
usage_intervals.append((cluster_launched_at, None))
|
196
236
|
|
237
|
+
user_hash = common_utils.get_user_hash()
|
238
|
+
|
197
239
|
_DB.cursor.execute(
|
198
240
|
'INSERT or REPLACE INTO clusters'
|
199
241
|
# All the fields need to exist here, even if they don't need
|
@@ -203,7 +245,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
203
245
|
'(name, launched_at, handle, last_use, status, '
|
204
246
|
'autostop, to_down, metadata, owner, cluster_hash, '
|
205
247
|
'storage_mounts_metadata, cluster_ever_up, status_updated_at, '
|
206
|
-
'config_hash) '
|
248
|
+
'config_hash, user_hash) '
|
207
249
|
'VALUES ('
|
208
250
|
# name
|
209
251
|
'?, '
|
@@ -240,11 +282,14 @@ def add_or_update_cluster(cluster_name: str,
|
|
240
282
|
'COALESCE('
|
241
283
|
'(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
|
242
284
|
# cluster_ever_up
|
243
|
-
'((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
|
285
|
+
'((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?), '
|
244
286
|
# status_updated_at
|
245
287
|
'?,'
|
246
288
|
# config_hash
|
247
|
-
'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?))'
|
289
|
+
'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?)),'
|
290
|
+
# user_hash: keep original user_hash if it exists
|
291
|
+
'COALESCE('
|
292
|
+
'(SELECT user_hash FROM clusters WHERE name=?), ?)'
|
248
293
|
')',
|
249
294
|
(
|
250
295
|
# name
|
@@ -281,6 +326,9 @@ def add_or_update_cluster(cluster_name: str,
|
|
281
326
|
# config_hash
|
282
327
|
config_hash,
|
283
328
|
cluster_name,
|
329
|
+
# user_hash
|
330
|
+
cluster_name,
|
331
|
+
user_hash,
|
284
332
|
))
|
285
333
|
|
286
334
|
launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
|
@@ -288,7 +336,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
288
336
|
_DB.cursor.execute(
|
289
337
|
'INSERT or REPLACE INTO cluster_history'
|
290
338
|
'(cluster_hash, name, num_nodes, requested_resources, '
|
291
|
-
'launched_resources, usage_intervals) '
|
339
|
+
'launched_resources, usage_intervals, user_hash) '
|
292
340
|
'VALUES ('
|
293
341
|
# hash
|
294
342
|
'?, '
|
@@ -301,7 +349,10 @@ def add_or_update_cluster(cluster_name: str,
|
|
301
349
|
# number of nodes
|
302
350
|
'?, '
|
303
351
|
# usage intervals
|
304
|
-
'
|
352
|
+
'?, '
|
353
|
+
# user_hash
|
354
|
+
'?'
|
355
|
+
')',
|
305
356
|
(
|
306
357
|
# hash
|
307
358
|
cluster_hash,
|
@@ -315,15 +366,37 @@ def add_or_update_cluster(cluster_name: str,
|
|
315
366
|
pickle.dumps(launched_resources),
|
316
367
|
# usage intervals
|
317
368
|
pickle.dumps(usage_intervals),
|
369
|
+
# user_hash
|
370
|
+
user_hash,
|
318
371
|
))
|
319
372
|
|
320
373
|
_DB.conn.commit()
|
321
374
|
|
322
375
|
|
376
|
+
def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
|
377
|
+
"""Returns the user hash or the current user hash, if user_hash is None.
|
378
|
+
|
379
|
+
This is to ensure that the clusters created before the client-server
|
380
|
+
architecture (no user hash info previously) are associated with the current
|
381
|
+
user.
|
382
|
+
"""
|
383
|
+
if user_hash is not None:
|
384
|
+
return user_hash
|
385
|
+
return common_utils.get_user_hash()
|
386
|
+
|
387
|
+
|
388
|
+
def update_cluster_handle(cluster_name: str,
|
389
|
+
cluster_handle: 'backends.ResourceHandle'):
|
390
|
+
handle = pickle.dumps(cluster_handle)
|
391
|
+
_DB.cursor.execute('UPDATE clusters SET handle=(?) WHERE name=(?)',
|
392
|
+
(handle, cluster_name))
|
393
|
+
_DB.conn.commit()
|
394
|
+
|
395
|
+
|
323
396
|
def update_last_use(cluster_name: str):
|
324
397
|
"""Updates the last used command for the cluster."""
|
325
398
|
_DB.cursor.execute('UPDATE clusters SET last_use=(?) WHERE name=(?)',
|
326
|
-
(common_utils.
|
399
|
+
(common_utils.get_current_command(), cluster_name))
|
327
400
|
_DB.conn.commit()
|
328
401
|
|
329
402
|
|
@@ -596,7 +669,7 @@ def get_cluster_from_name(
|
|
596
669
|
rows = _DB.cursor.execute(
|
597
670
|
'SELECT name, launched_at, handle, last_use, status, autostop, '
|
598
671
|
'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
|
599
|
-
'cluster_ever_up, status_updated_at, config_hash '
|
672
|
+
'cluster_ever_up, status_updated_at, config_hash, user_hash '
|
600
673
|
'FROM clusters WHERE name=(?)', (cluster_name,)).fetchall()
|
601
674
|
for row in rows:
|
602
675
|
# Explicitly specify the number of fields to unpack, so that
|
@@ -604,7 +677,8 @@ def get_cluster_from_name(
|
|
604
677
|
# breaking the previous code.
|
605
678
|
(name, launched_at, handle, last_use, status, autostop, metadata,
|
606
679
|
to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
|
607
|
-
status_updated_at, config_hash) = row
|
680
|
+
status_updated_at, config_hash, user_hash) = row
|
681
|
+
user_hash = _get_user_hash_or_current_user(user_hash)
|
608
682
|
# TODO: use namedtuple instead of dict
|
609
683
|
record = {
|
610
684
|
'name': name,
|
@@ -621,6 +695,8 @@ def get_cluster_from_name(
|
|
621
695
|
_load_storage_mounts_metadata(storage_mounts_metadata),
|
622
696
|
'cluster_ever_up': bool(cluster_ever_up),
|
623
697
|
'status_updated_at': status_updated_at,
|
698
|
+
'user_hash': user_hash,
|
699
|
+
'user_name': get_user(user_hash).name,
|
624
700
|
'config_hash': config_hash,
|
625
701
|
}
|
626
702
|
return record
|
@@ -631,13 +707,14 @@ def get_clusters() -> List[Dict[str, Any]]:
|
|
631
707
|
rows = _DB.cursor.execute(
|
632
708
|
'select name, launched_at, handle, last_use, status, autostop, '
|
633
709
|
'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
|
634
|
-
'cluster_ever_up, status_updated_at, config_hash '
|
710
|
+
'cluster_ever_up, status_updated_at, config_hash, user_hash '
|
635
711
|
'from clusters order by launched_at desc').fetchall()
|
636
712
|
records = []
|
637
713
|
for row in rows:
|
638
714
|
(name, launched_at, handle, last_use, status, autostop, metadata,
|
639
715
|
to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
|
640
|
-
status_updated_at, config_hash) = row
|
716
|
+
status_updated_at, config_hash, user_hash) = row
|
717
|
+
user_hash = _get_user_hash_or_current_user(user_hash)
|
641
718
|
# TODO: use namedtuple instead of dict
|
642
719
|
record = {
|
643
720
|
'name': name,
|
@@ -654,6 +731,8 @@ def get_clusters() -> List[Dict[str, Any]]:
|
|
654
731
|
_load_storage_mounts_metadata(storage_mounts_metadata),
|
655
732
|
'cluster_ever_up': bool(cluster_ever_up),
|
656
733
|
'status_updated_at': status_updated_at,
|
734
|
+
'user_hash': user_hash,
|
735
|
+
'user_name': get_user(user_hash).name,
|
657
736
|
'config_hash': config_hash,
|
658
737
|
}
|
659
738
|
|
@@ -664,7 +743,8 @@ def get_clusters() -> List[Dict[str, Any]]:
|
|
664
743
|
def get_clusters_from_history() -> List[Dict[str, Any]]:
|
665
744
|
rows = _DB.cursor.execute(
|
666
745
|
'SELECT ch.cluster_hash, ch.name, ch.num_nodes, '
|
667
|
-
'ch.launched_resources, ch.usage_intervals, clusters.status
|
746
|
+
'ch.launched_resources, ch.usage_intervals, clusters.status, '
|
747
|
+
'ch.user_hash '
|
668
748
|
'FROM cluster_history ch '
|
669
749
|
'LEFT OUTER JOIN clusters '
|
670
750
|
'ON ch.cluster_hash=clusters.cluster_hash ').fetchall()
|
@@ -683,7 +763,9 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
|
|
683
763
|
launched_resources,
|
684
764
|
usage_intervals,
|
685
765
|
status,
|
686
|
-
|
766
|
+
user_hash,
|
767
|
+
) = row[:7]
|
768
|
+
user_hash = _get_user_hash_or_current_user(user_hash)
|
687
769
|
|
688
770
|
if status is not None:
|
689
771
|
status = status_lib.ClusterStatus[status]
|
@@ -697,6 +779,7 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
|
|
697
779
|
'cluster_hash': cluster_hash,
|
698
780
|
'usage_intervals': pickle.loads(usage_intervals),
|
699
781
|
'status': status,
|
782
|
+
'user_hash': user_hash,
|
700
783
|
}
|
701
784
|
|
702
785
|
records.append(record)
|
@@ -712,17 +795,17 @@ def get_cluster_names_start_with(starts_with: str) -> List[str]:
|
|
712
795
|
return [row[0] for row in rows]
|
713
796
|
|
714
797
|
|
715
|
-
def get_cached_enabled_clouds() -> List[clouds.Cloud]:
|
798
|
+
def get_cached_enabled_clouds() -> List['clouds.Cloud']:
|
716
799
|
rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?',
|
717
800
|
(_ENABLED_CLOUDS_KEY,))
|
718
801
|
ret = []
|
719
802
|
for (value,) in rows:
|
720
803
|
ret = json.loads(value)
|
721
804
|
break
|
722
|
-
enabled_clouds: List[clouds.Cloud] = []
|
805
|
+
enabled_clouds: List['clouds.Cloud'] = []
|
723
806
|
for c in ret:
|
724
807
|
try:
|
725
|
-
cloud =
|
808
|
+
cloud = registry.CLOUD_REGISTRY.from_str(c)
|
726
809
|
except ValueError:
|
727
810
|
# Handle the case for the clouds whose support has been removed from
|
728
811
|
# SkyPilot, e.g., 'local' was a cloud in the past and may be stored
|
@@ -745,7 +828,7 @@ def add_or_update_storage(storage_name: str,
|
|
745
828
|
storage_status: status_lib.StorageStatus):
|
746
829
|
storage_launched_at = int(time.time())
|
747
830
|
handle = pickle.dumps(storage_handle)
|
748
|
-
last_use = common_utils.
|
831
|
+
last_use = common_utils.get_current_command()
|
749
832
|
|
750
833
|
def status_check(status):
|
751
834
|
return status in status_lib.StorageStatus
|
sky/jobs/__init__.py
CHANGED
@@ -1,47 +1,45 @@
|
|
1
1
|
"""Managed jobs."""
|
2
2
|
import pathlib
|
3
3
|
|
4
|
+
from sky.jobs.client.sdk import cancel
|
5
|
+
from sky.jobs.client.sdk import dashboard
|
6
|
+
from sky.jobs.client.sdk import download_logs
|
7
|
+
from sky.jobs.client.sdk import launch
|
8
|
+
from sky.jobs.client.sdk import queue
|
9
|
+
from sky.jobs.client.sdk import tail_logs
|
4
10
|
from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
|
11
|
+
from sky.jobs.constants import JOBS_CONTROLLER_LOGS_DIR
|
5
12
|
from sky.jobs.constants import JOBS_CONTROLLER_TEMPLATE
|
6
13
|
from sky.jobs.constants import JOBS_CONTROLLER_YAML_PREFIX
|
7
14
|
from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
|
8
|
-
from sky.jobs.
|
9
|
-
from sky.jobs.core import launch
|
10
|
-
from sky.jobs.core import queue
|
11
|
-
from sky.jobs.core import queue_from_kubernetes_pod
|
12
|
-
from sky.jobs.core import sync_down_logs
|
13
|
-
from sky.jobs.core import tail_logs
|
14
|
-
from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
|
15
|
-
from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
|
15
|
+
from sky.jobs.recovery_strategy import StrategyExecutor
|
16
16
|
from sky.jobs.state import ManagedJobStatus
|
17
17
|
from sky.jobs.utils import dump_managed_job_queue
|
18
18
|
from sky.jobs.utils import format_job_table
|
19
|
-
from sky.jobs.utils import JOB_CONTROLLER_NAME
|
20
19
|
from sky.jobs.utils import load_managed_job_queue
|
21
20
|
from sky.jobs.utils import ManagedJobCodeGen
|
22
21
|
|
23
22
|
pathlib.Path(JOBS_TASK_YAML_PREFIX).expanduser().parent.mkdir(parents=True,
|
24
23
|
exist_ok=True)
|
25
24
|
__all__ = [
|
26
|
-
'RECOVERY_STRATEGIES',
|
27
|
-
'DEFAULT_RECOVERY_STRATEGY',
|
28
|
-
'JOB_CONTROLLER_NAME',
|
29
25
|
# Constants
|
30
26
|
'JOBS_CONTROLLER_TEMPLATE',
|
31
27
|
'JOBS_CONTROLLER_YAML_PREFIX',
|
32
28
|
'JOBS_TASK_YAML_PREFIX',
|
29
|
+
'JOBS_CONTROLLER_LOGS_DIR',
|
33
30
|
# Enums
|
34
31
|
'ManagedJobStatus',
|
35
32
|
# Core
|
36
33
|
'cancel',
|
37
34
|
'launch',
|
38
35
|
'queue',
|
39
|
-
'queue_from_kubernetes_pod',
|
40
36
|
'tail_logs',
|
41
|
-
'
|
37
|
+
'dashboard',
|
38
|
+
'download_logs',
|
42
39
|
# utils
|
43
40
|
'ManagedJobCodeGen',
|
44
41
|
'format_job_table',
|
45
42
|
'dump_managed_job_queue',
|
46
43
|
'load_managed_job_queue',
|
44
|
+
'StrategyExecutor',
|
47
45
|
]
|
File without changes
|