skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. sky/__init__.py +48 -22
  2. sky/adaptors/aws.py +2 -1
  3. sky/adaptors/azure.py +4 -4
  4. sky/adaptors/cloudflare.py +4 -4
  5. sky/adaptors/kubernetes.py +8 -8
  6. sky/authentication.py +42 -45
  7. sky/backends/backend.py +2 -2
  8. sky/backends/backend_utils.py +108 -221
  9. sky/backends/cloud_vm_ray_backend.py +283 -282
  10. sky/benchmark/benchmark_utils.py +6 -2
  11. sky/check.py +40 -28
  12. sky/cli.py +1213 -1116
  13. sky/client/__init__.py +1 -0
  14. sky/client/cli.py +5644 -0
  15. sky/client/common.py +345 -0
  16. sky/client/sdk.py +1757 -0
  17. sky/cloud_stores.py +12 -6
  18. sky/clouds/__init__.py +0 -2
  19. sky/clouds/aws.py +20 -13
  20. sky/clouds/azure.py +5 -3
  21. sky/clouds/cloud.py +1 -1
  22. sky/clouds/cudo.py +2 -1
  23. sky/clouds/do.py +2 -1
  24. sky/clouds/fluidstack.py +3 -2
  25. sky/clouds/gcp.py +10 -8
  26. sky/clouds/ibm.py +8 -7
  27. sky/clouds/kubernetes.py +7 -6
  28. sky/clouds/lambda_cloud.py +8 -7
  29. sky/clouds/oci.py +4 -3
  30. sky/clouds/paperspace.py +2 -1
  31. sky/clouds/runpod.py +2 -1
  32. sky/clouds/scp.py +8 -7
  33. sky/clouds/service_catalog/__init__.py +3 -3
  34. sky/clouds/service_catalog/aws_catalog.py +7 -1
  35. sky/clouds/service_catalog/common.py +4 -2
  36. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
  37. sky/clouds/utils/oci_utils.py +1 -1
  38. sky/clouds/vast.py +2 -1
  39. sky/clouds/vsphere.py +2 -1
  40. sky/core.py +263 -99
  41. sky/dag.py +4 -0
  42. sky/data/mounting_utils.py +2 -1
  43. sky/data/storage.py +97 -35
  44. sky/data/storage_utils.py +69 -9
  45. sky/exceptions.py +138 -5
  46. sky/execution.py +47 -50
  47. sky/global_user_state.py +105 -22
  48. sky/jobs/__init__.py +12 -14
  49. sky/jobs/client/__init__.py +0 -0
  50. sky/jobs/client/sdk.py +296 -0
  51. sky/jobs/constants.py +30 -1
  52. sky/jobs/controller.py +12 -6
  53. sky/jobs/dashboard/dashboard.py +2 -6
  54. sky/jobs/recovery_strategy.py +22 -29
  55. sky/jobs/server/__init__.py +1 -0
  56. sky/jobs/{core.py → server/core.py} +101 -34
  57. sky/jobs/server/dashboard_utils.py +64 -0
  58. sky/jobs/server/server.py +182 -0
  59. sky/jobs/utils.py +32 -23
  60. sky/models.py +27 -0
  61. sky/optimizer.py +9 -11
  62. sky/provision/__init__.py +6 -3
  63. sky/provision/aws/config.py +2 -2
  64. sky/provision/aws/instance.py +1 -1
  65. sky/provision/azure/instance.py +1 -1
  66. sky/provision/cudo/instance.py +1 -1
  67. sky/provision/do/instance.py +1 -1
  68. sky/provision/do/utils.py +0 -5
  69. sky/provision/fluidstack/fluidstack_utils.py +4 -3
  70. sky/provision/fluidstack/instance.py +4 -2
  71. sky/provision/gcp/instance.py +1 -1
  72. sky/provision/instance_setup.py +2 -2
  73. sky/provision/kubernetes/constants.py +8 -0
  74. sky/provision/kubernetes/instance.py +1 -1
  75. sky/provision/kubernetes/utils.py +67 -76
  76. sky/provision/lambda_cloud/instance.py +3 -15
  77. sky/provision/logging.py +1 -1
  78. sky/provision/oci/instance.py +7 -4
  79. sky/provision/paperspace/instance.py +1 -1
  80. sky/provision/provisioner.py +3 -2
  81. sky/provision/runpod/instance.py +1 -1
  82. sky/provision/vast/instance.py +1 -1
  83. sky/provision/vast/utils.py +2 -1
  84. sky/provision/vsphere/instance.py +2 -11
  85. sky/resources.py +55 -40
  86. sky/serve/__init__.py +6 -10
  87. sky/serve/client/__init__.py +0 -0
  88. sky/serve/client/sdk.py +366 -0
  89. sky/serve/constants.py +3 -0
  90. sky/serve/replica_managers.py +10 -10
  91. sky/serve/serve_utils.py +56 -36
  92. sky/serve/server/__init__.py +0 -0
  93. sky/serve/{core.py → server/core.py} +37 -17
  94. sky/serve/server/server.py +117 -0
  95. sky/serve/service.py +8 -1
  96. sky/server/__init__.py +1 -0
  97. sky/server/common.py +441 -0
  98. sky/server/constants.py +21 -0
  99. sky/server/html/log.html +174 -0
  100. sky/server/requests/__init__.py +0 -0
  101. sky/server/requests/executor.py +462 -0
  102. sky/server/requests/payloads.py +481 -0
  103. sky/server/requests/queues/__init__.py +0 -0
  104. sky/server/requests/queues/mp_queue.py +76 -0
  105. sky/server/requests/requests.py +567 -0
  106. sky/server/requests/serializers/__init__.py +0 -0
  107. sky/server/requests/serializers/decoders.py +192 -0
  108. sky/server/requests/serializers/encoders.py +166 -0
  109. sky/server/server.py +1095 -0
  110. sky/server/stream_utils.py +144 -0
  111. sky/setup_files/MANIFEST.in +1 -0
  112. sky/setup_files/dependencies.py +12 -4
  113. sky/setup_files/setup.py +1 -1
  114. sky/sky_logging.py +9 -13
  115. sky/skylet/autostop_lib.py +2 -2
  116. sky/skylet/constants.py +46 -12
  117. sky/skylet/events.py +5 -6
  118. sky/skylet/job_lib.py +78 -66
  119. sky/skylet/log_lib.py +17 -11
  120. sky/skypilot_config.py +79 -94
  121. sky/task.py +119 -73
  122. sky/templates/aws-ray.yml.j2 +4 -4
  123. sky/templates/azure-ray.yml.j2 +3 -2
  124. sky/templates/cudo-ray.yml.j2 +3 -2
  125. sky/templates/fluidstack-ray.yml.j2 +3 -2
  126. sky/templates/gcp-ray.yml.j2 +3 -2
  127. sky/templates/ibm-ray.yml.j2 +3 -2
  128. sky/templates/jobs-controller.yaml.j2 +1 -12
  129. sky/templates/kubernetes-ray.yml.j2 +3 -2
  130. sky/templates/lambda-ray.yml.j2 +3 -2
  131. sky/templates/oci-ray.yml.j2 +3 -2
  132. sky/templates/paperspace-ray.yml.j2 +3 -2
  133. sky/templates/runpod-ray.yml.j2 +3 -2
  134. sky/templates/scp-ray.yml.j2 +3 -2
  135. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  136. sky/templates/vsphere-ray.yml.j2 +4 -2
  137. sky/templates/websocket_proxy.py +64 -0
  138. sky/usage/constants.py +8 -0
  139. sky/usage/usage_lib.py +45 -11
  140. sky/utils/accelerator_registry.py +33 -53
  141. sky/utils/admin_policy_utils.py +2 -1
  142. sky/utils/annotations.py +51 -0
  143. sky/utils/cli_utils/status_utils.py +33 -3
  144. sky/utils/cluster_utils.py +356 -0
  145. sky/utils/command_runner.py +69 -14
  146. sky/utils/common.py +74 -0
  147. sky/utils/common_utils.py +133 -93
  148. sky/utils/config_utils.py +204 -0
  149. sky/utils/control_master_utils.py +2 -3
  150. sky/utils/controller_utils.py +133 -147
  151. sky/utils/dag_utils.py +72 -24
  152. sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
  153. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  154. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  155. sky/utils/log_utils.py +83 -23
  156. sky/utils/message_utils.py +81 -0
  157. sky/utils/registry.py +127 -0
  158. sky/utils/resources_utils.py +2 -2
  159. sky/utils/rich_utils.py +213 -34
  160. sky/utils/schemas.py +19 -2
  161. sky/{status_lib.py → utils/status_lib.py} +12 -7
  162. sky/utils/subprocess_utils.py +51 -35
  163. sky/utils/timeline.py +7 -2
  164. sky/utils/ux_utils.py +95 -25
  165. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
  166. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
  167. sky/clouds/cloud_registry.py +0 -76
  168. sky/utils/cluster_yaml_utils.py +0 -24
  169. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
  170. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/execution.py CHANGED
@@ -3,28 +3,32 @@
 See `Stage` for a Task's life cycle.
 """
 import enum
+import typing
 from typing import List, Optional, Tuple, Union

 import colorama

-import sky
 from sky import admin_policy
 from sky import backends
 from sky import clouds
 from sky import global_user_state
 from sky import optimizer
 from sky import sky_logging
-from sky import status_lib
 from sky.backends import backend_utils
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
+from sky.utils import common
 from sky.utils import controller_utils
 from sky.utils import dag_utils
 from sky.utils import resources_utils
 from sky.utils import rich_utils
+from sky.utils import status_lib
 from sky.utils import timeline
 from sky.utils import ux_utils

+if typing.TYPE_CHECKING:
+    import sky
+
 logger = sky_logging.init_logger(__name__)


@@ -100,7 +104,7 @@ def _execute(
     handle: Optional[backends.ResourceHandle] = None,
     backend: Optional[backends.Backend] = None,
     retry_until_up: bool = False,
-    optimize_target: optimizer.OptimizeTarget = optimizer.OptimizeTarget.COST,
+    optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
     stages: Optional[List[Stage]] = None,
     cluster_name: Optional[str] = None,
     detach_setup: bool = False,
@@ -111,6 +115,7 @@ def _execute(
     skip_unnecessary_provisioning: bool = False,
     # Internal only:
     # pylint: disable=invalid-name
+    _quiet_optimizer: bool = False,
     _is_launched_by_jobs_controller: bool = False,
     _is_launched_by_sky_serve_controller: bool = False,
 ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
@@ -167,16 +172,19 @@ def _execute(
     """

     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
-    if not dag.policy_applied:
-        dag, _ = admin_policy_utils.apply(
-            dag,
-            request_options=admin_policy.RequestOptions(
-                cluster_name=cluster_name,
-                idle_minutes_to_autostop=idle_minutes_to_autostop,
-                down=down,
-                dryrun=dryrun,
-            ),
-        )
+    for task in dag.tasks:
+        if task.storage_mounts is not None:
+            for storage in task.storage_mounts.values():
+                # Ensure the storage is constructed.
+                storage.construct()
+    dag, _ = admin_policy_utils.apply(
+        dag,
+        request_options=admin_policy.RequestOptions(
+            cluster_name=cluster_name,
+            idle_minutes_to_autostop=idle_minutes_to_autostop,
+            down=down,
+            dryrun=dryrun,
+        ))
     assert len(dag) == 1, f'We support 1 task for now. {dag}'
     task = dag.tasks[0]

@@ -274,14 +282,15 @@ def _execute(
                 # no-credential machine should not enter optimize(), which
                 # would directly error out ('No cloud is enabled...'). Fix
                 # by moving `sky check` checks out of optimize()?
-
                 controller = controller_utils.Controllers.from_name(
                     cluster_name)
                 if controller is not None:
                     logger.info(
                         f'Choosing resources for {controller.value.name}...'
                     )
-                dag = sky.optimize(dag, minimize=optimize_target)
+                dag = optimizer.Optimizer.optimize(dag,
+                                                   minimize=optimize_target,
+                                                   quiet=_quiet_optimizer)
                 task = dag.tasks[0]  # Keep: dag may have been deep-copied.
                 assert task.best_resources is not None, task

@@ -320,7 +329,7 @@ def _execute(
                       (task.file_mounts is not None or
                        task.storage_mounts is not None))
     if do_workdir or do_file_mounts:
-        logger.info(ux_utils.starting_message('Mounting files.'))
+        logger.info(ux_utils.starting_message('Syncing files.'))

     if do_workdir:
         backend.sync_workdir(handle, task.workdir)
@@ -374,20 +383,19 @@ def launch(
     down: bool = False,
     stream_logs: bool = True,
     backend: Optional[backends.Backend] = None,
-    optimize_target: optimizer.OptimizeTarget = optimizer.OptimizeTarget.COST,
-    detach_setup: bool = False,
-    detach_run: bool = False,
+    optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
     no_setup: bool = False,
     clone_disk_from: Optional[str] = None,
     fast: bool = False,
     # Internal only:
     # pylint: disable=invalid-name
+    _quiet_optimizer: bool = False,
     _is_launched_by_jobs_controller: bool = False,
     _is_launched_by_sky_serve_controller: bool = False,
     _disable_controller_check: bool = False,
 ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Launch a cluster or task.
+    """Launches a cluster or task.

     The task's setup and run commands are executed under the task's workdir
     (when specified, it is synced to remote cluster). The task undergoes job
@@ -397,6 +405,16 @@ def launch(
     usage) a sky.Dag. In the latter case, currently it must contain a single
     task; support for pipelines/general DAGs are in experimental branches.

+    Example:
+        .. code-block:: python
+
+            import sky
+            task = sky.Task(run='echo hello SkyPilot')
+            task.set_resources(
+                sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
+            sky.launch(task, cluster_name='my-cluster')
+
+
     Args:
         task: sky.Task, or sky.Dag (experimental; 1-task only) to launch.
         cluster_name: name of the cluster to create/reuse. If None,
@@ -408,7 +426,7 @@ def launch(
             cluster's job queue. Idleness gets reset whenever setting-up/
             running/pending jobs are found in the job queue. Setting this
             flag is equivalent to running
-            ``sky.launch(..., detach_run=True, ...)`` and then
+            ``sky.launch(...)`` and then
             ``sky.autostop(idle_minutes=<minutes>)``. If not set, the cluster
             will not be autostopped.
         down: Tear down the cluster after all jobs finish (successfully or
@@ -422,14 +440,6 @@ def launch(
             (CloudVMRayBackend).
         optimize_target: target to optimize for. Choices: OptimizeTarget.COST,
             OptimizeTarget.TIME.
-        detach_setup: If True, run setup in non-interactive mode as part of the
-            job itself. You can safely ctrl-c to detach from logging, and it
-            will not interrupt the setup process. To see the logs again after
-            detaching, use `sky logs`. To cancel setup, cancel the job via
-            `sky cancel`. Useful for long-running setup
-            commands.
-        detach_run: If True, as soon as a job is submitted, return from this
-            function and do not stream execution logs.
         no_setup: if True, do not re-run setup commands.
         clone_disk_from: [Experimental] if set, clone the disk from the
             specified cluster. This is useful to migrate the cluster to a
@@ -437,15 +447,6 @@ def launch(
         fast: [Experimental] If the cluster is already up and available,
             skip provisioning and setup steps.

-    Example:
-        .. code-block:: python
-
-            import sky
-            task = sky.Task(run='echo hello SkyPilot')
-            task.set_resources(
-                sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
-            sky.launch(task, cluster_name='my-cluster')
-
     Raises:
         exceptions.ClusterOwnerIdentityMismatchError: if the cluster is
             owned by another user.
@@ -474,7 +475,9 @@ def launch(
         handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
             if dryrun.
     """
+
     entrypoint = task
+    entrypoint.validate()
     if not _disable_controller_check:
         controller_utils.check_cluster_name_not_controller(
             cluster_name, operation_str='sky.launch')
@@ -537,12 +540,13 @@ def launch(
         optimize_target=optimize_target,
         stages=stages,
         cluster_name=cluster_name,
-        detach_setup=detach_setup,
-        detach_run=detach_run,
+        detach_setup=True,
+        detach_run=True,
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
         skip_unnecessary_provisioning=skip_unnecessary_provisioning,
+        _quiet_optimizer=_quiet_optimizer,
         _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
         _is_launched_by_sky_serve_controller=
        _is_launched_by_sky_serve_controller,
@@ -557,10 +561,9 @@ def exec(  # pylint: disable=redefined-builtin
     down: bool = False,
     stream_logs: bool = True,
     backend: Optional[backends.Backend] = None,
-    detach_run: bool = False,
 ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Execute a task on an existing cluster.
+    """Executes a task on an existing cluster.

     This function performs two actions:

@@ -595,8 +598,6 @@ def exec(  # pylint: disable=redefined-builtin
         stream_logs: if True, show the logs in the terminal.
         backend: backend to use. If None, use the default backend
             (CloudVMRayBackend).
-        detach_run: if True, detach from logging once the task has been
-            submitted.

     Raises:
         ValueError: if the specified cluster is not in UP status.
@@ -613,11 +614,7 @@ def exec(  # pylint: disable=redefined-builtin
             if dryrun.
     """
     entrypoint = task
-    if isinstance(entrypoint, sky.Dag):
-        logger.warning(
-            f'{colorama.Fore.YELLOW}Passing a sky.Dag to sky.exec() is '
-            'deprecated. Pass sky.Task instead.'
-            f'{colorama.Style.RESET_ALL}')
+    entrypoint.validate(workdir_only=True)
     controller_utils.check_cluster_name_not_controller(cluster_name,
                                                        operation_str='sky.exec')

@@ -638,5 +635,5 @@ def exec(  # pylint: disable=redefined-builtin
             Stage.EXEC,
         ],
         cluster_name=cluster_name,
-        detach_run=detach_run,
+        detach_run=True,
     )
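
Net effect for SDK callers: `detach_setup` and `detach_run` are removed from sky.launch() and sky.exec(), and both now always detach after submission (_execute() is called with both flags hardcoded to True). A minimal migration sketch follows; the sky.tail_logs() call shape is an assumption based on the existing sky/core.py API, not something this diff shows.

import sky

task = sky.Task(run='echo hello SkyPilot')
task.set_resources(sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))

# Returns once the job is submitted; there is no detach_run flag anymore.
job_id, handle = sky.launch(task, cluster_name='my-cluster')

# Log streaming is now opt-in (assumed signature from sky/core.py).
if job_id is not None:
    sky.tail_logs(cluster_name='my-cluster', job_id=job_id, follow=True)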
sky/global_user_state.py CHANGED
@@ -16,15 +16,20 @@ import typing
 from typing import Any, Dict, List, Optional, Set, Tuple
 import uuid

-from sky import clouds
-from sky import status_lib
+from sky import models
+from sky import sky_logging
 from sky.utils import common_utils
 from sky.utils import db_utils
+from sky.utils import registry
+from sky.utils import status_lib

 if typing.TYPE_CHECKING:
     from sky import backends
+    from sky import clouds
     from sky.data import Storage

+logger = sky_logging.init_logger(__name__)
+
 _ENABLED_CLOUDS_KEY = 'enabled_clouds'

 _DB_PATH = os.path.expanduser('~/.sky/state.db')
@@ -62,7 +67,8 @@ def create_table(cursor, conn):
         storage_mounts_metadata BLOB DEFAULT null,
         cluster_ever_up INTEGER DEFAULT 0,
         status_updated_at INTEGER DEFAULT null,
-        config_hash TEXT DEFAULT null)""")
+        config_hash TEXT DEFAULT null,
+        user_hash TEXT DEFAULT null)""")

     # Table for Cluster History
     # usage_intervals: List[Tuple[int, int]]
@@ -85,7 +91,8 @@ def create_table(cursor, conn):
         num_nodes int,
         requested_resources BLOB,
         launched_resources BLOB,
-        usage_intervals BLOB)""")
+        usage_intervals BLOB,
+        user_hash TEXT)""")
     # Table for configs (e.g. enabled clouds)
     cursor.execute("""\
         CREATE TABLE IF NOT EXISTS config (
@@ -98,6 +105,11 @@ def create_table(cursor, conn):
         handle BLOB,
         last_use TEXT,
         status TEXT)""")
+    # Table for User
+    cursor.execute("""\
+        CREATE TABLE IF NOT EXISTS users (
+        id TEXT PRIMARY KEY,
+        name TEXT)""")
     # For backward compatibility.
     # TODO(zhwu): Remove this function after all users have migrated to
     # the latest version of SkyPilot.
@@ -111,6 +123,7 @@ def create_table(cursor, conn):
     db_utils.add_column_to_table(cursor, conn, 'clusters', 'to_down',
                                  'INTEGER DEFAULT 0')

+    # The cloud identity that created the cluster.
     db_utils.add_column_to_table(cursor, conn, 'clusters', 'owner', 'TEXT')

     db_utils.add_column_to_table(cursor, conn, 'clusters', 'cluster_hash',
@@ -132,19 +145,46 @@ def create_table(cursor, conn):
         # clusters were never really UP, setting it to 1 means they won't be
         # auto-deleted during any failover.
         value_to_replace_existing_entries=1)
-
     db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
                                  'INTEGER DEFAULT null')
+    db_utils.add_column_to_table(
+        cursor,
+        conn,
+        'clusters',
+        'user_hash',
+        'TEXT DEFAULT null',
+        value_to_replace_existing_entries=common_utils.get_user_hash())
+    db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
+                                 'TEXT DEFAULT null')

     db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
                                  'TEXT DEFAULT null')

+    db_utils.add_column_to_table(cursor, conn, 'cluster_history', 'user_hash',
+                                 'TEXT DEFAULT null')
     conn.commit()


 _DB = db_utils.SQLiteConn(_DB_PATH, create_table)


+def add_or_update_user(user: models.User):
+    """Store the mapping from user hash to user name for display purposes."""
+    if user.name is None:
+        return
+    _DB.cursor.execute('INSERT OR REPLACE INTO users (id, name) VALUES (?, ?)',
+                       (user.id, user.name))
+    _DB.conn.commit()
+
+
+def get_user(user_id: str) -> models.User:
+    row = _DB.cursor.execute('SELECT id, name FROM users WHERE id=?',
+                             (user_id,)).fetchone()
+    if row is None:
+        return models.User(id=user_id)
+    return models.User(id=row[0], name=row[1])
+
+
 def add_or_update_cluster(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle',
                           requested_resources: Optional[Set[Any]],
@@ -165,7 +205,7 @@ def add_or_update_cluster(cluster_name: str,
     # FIXME: launched_at will be changed when `sky launch -c` is called.
     handle = pickle.dumps(cluster_handle)
     cluster_launched_at = int(time.time()) if is_launch else None
-    last_use = common_utils.get_pretty_entry_point() if is_launch else None
+    last_use = common_utils.get_current_command() if is_launch else None
     status = status_lib.ClusterStatus.INIT
     if ready:
         status = status_lib.ClusterStatus.UP
@@ -194,6 +234,8 @@ def add_or_update_cluster(cluster_name: str,
         cluster_launched_at = int(time.time())
         usage_intervals.append((cluster_launched_at, None))

+    user_hash = common_utils.get_user_hash()
+
     _DB.cursor.execute(
         'INSERT or REPLACE INTO clusters'
         # All the fields need to exist here, even if they don't need
@@ -203,7 +245,7 @@ def add_or_update_cluster(cluster_name: str,
         '(name, launched_at, handle, last_use, status, '
         'autostop, to_down, metadata, owner, cluster_hash, '
         'storage_mounts_metadata, cluster_ever_up, status_updated_at, '
-        'config_hash) '
+        'config_hash, user_hash) '
         'VALUES ('
         # name
         '?, '
@@ -240,11 +282,14 @@ def add_or_update_cluster(cluster_name: str,
         'COALESCE('
         '(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
         # cluster_ever_up
-        '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
+        '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?), '
         # status_updated_at
         '?,'
         # config_hash
-        'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?))'
+        'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?)),'
+        # user_hash: keep original user_hash if it exists
+        'COALESCE('
+        '(SELECT user_hash FROM clusters WHERE name=?), ?)'
         ')',
         (
             # name
@@ -281,6 +326,9 @@ def add_or_update_cluster(cluster_name: str,
             # config_hash
             config_hash,
             cluster_name,
+            # user_hash
+            cluster_name,
+            user_hash,
         ))

     launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -288,7 +336,7 @@ def add_or_update_cluster(cluster_name: str,
     _DB.cursor.execute(
         'INSERT or REPLACE INTO cluster_history'
         '(cluster_hash, name, num_nodes, requested_resources, '
-        'launched_resources, usage_intervals) '
+        'launched_resources, usage_intervals, user_hash) '
         'VALUES ('
         # hash
         '?, '
@@ -301,7 +349,10 @@ def add_or_update_cluster(cluster_name: str,
         # number of nodes
         '?, '
         # usage intervals
-        '?)',
+        '?, '
+        # user_hash
+        '?'
+        ')',
         (
             # hash
             cluster_hash,
@@ -315,15 +366,37 @@ def add_or_update_cluster(cluster_name: str,
             pickle.dumps(launched_resources),
             # usage intervals
             pickle.dumps(usage_intervals),
+            # user_hash
+            user_hash,
         ))

     _DB.conn.commit()


+def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
+    """Returns the user hash or the current user hash, if user_hash is None.
+
+    This is to ensure that the clusters created before the client-server
+    architecture (no user hash info previously) are associated with the current
+    user.
+    """
+    if user_hash is not None:
+        return user_hash
+    return common_utils.get_user_hash()
+
+
+def update_cluster_handle(cluster_name: str,
+                          cluster_handle: 'backends.ResourceHandle'):
+    handle = pickle.dumps(cluster_handle)
+    _DB.cursor.execute('UPDATE clusters SET handle=(?) WHERE name=(?)',
+                       (handle, cluster_name))
+    _DB.conn.commit()
+
+
 def update_last_use(cluster_name: str):
     """Updates the last used command for the cluster."""
     _DB.cursor.execute('UPDATE clusters SET last_use=(?) WHERE name=(?)',
-                       (common_utils.get_pretty_entry_point(), cluster_name))
+                       (common_utils.get_current_command(), cluster_name))
     _DB.conn.commit()


@@ -596,7 +669,7 @@ def get_cluster_from_name(
     rows = _DB.cursor.execute(
         'SELECT name, launched_at, handle, last_use, status, autostop, '
         'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at, config_hash '
+        'cluster_ever_up, status_updated_at, config_hash, user_hash '
         'FROM clusters WHERE name=(?)', (cluster_name,)).fetchall()
     for row in rows:
         # Explicitly specify the number of fields to unpack, so that
@@ -604,7 +677,8 @@ def get_cluster_from_name(
         # breaking the previous code.
         (name, launched_at, handle, last_use, status, autostop, metadata,
          to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at, config_hash) = row[:14]
+         status_updated_at, config_hash, user_hash) = row
+        user_hash = _get_user_hash_or_current_user(user_hash)
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -621,6 +695,8 @@ def get_cluster_from_name(
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
             'status_updated_at': status_updated_at,
+            'user_hash': user_hash,
+            'user_name': get_user(user_hash).name,
             'config_hash': config_hash,
         }
         return record
@@ -631,13 +707,14 @@ def get_clusters() -> List[Dict[str, Any]]:
     rows = _DB.cursor.execute(
         'select name, launched_at, handle, last_use, status, autostop, '
         'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at, config_hash '
+        'cluster_ever_up, status_updated_at, config_hash, user_hash '
         'from clusters order by launched_at desc').fetchall()
     records = []
     for row in rows:
         (name, launched_at, handle, last_use, status, autostop, metadata,
          to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at, config_hash) = row[:14]
+         status_updated_at, config_hash, user_hash) = row
+        user_hash = _get_user_hash_or_current_user(user_hash)
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -654,6 +731,8 @@ def get_clusters() -> List[Dict[str, Any]]:
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
             'status_updated_at': status_updated_at,
+            'user_hash': user_hash,
+            'user_name': get_user(user_hash).name,
             'config_hash': config_hash,
         }

@@ -664,7 +743,8 @@ def get_clusters() -> List[Dict[str, Any]]:
 def get_clusters_from_history() -> List[Dict[str, Any]]:
     rows = _DB.cursor.execute(
         'SELECT ch.cluster_hash, ch.name, ch.num_nodes, '
-        'ch.launched_resources, ch.usage_intervals, clusters.status '
+        'ch.launched_resources, ch.usage_intervals, clusters.status, '
+        'ch.user_hash '
         'FROM cluster_history ch '
         'LEFT OUTER JOIN clusters '
         'ON ch.cluster_hash=clusters.cluster_hash ').fetchall()
@@ -683,7 +763,9 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
             launched_resources,
             usage_intervals,
             status,
-        ) = row[:6]
+            user_hash,
+        ) = row[:7]
+        user_hash = _get_user_hash_or_current_user(user_hash)

         if status is not None:
             status = status_lib.ClusterStatus[status]
@@ -697,6 +779,7 @@ def get_clusters_from_history() -> List[Dict[str, Any]]:
             'cluster_hash': cluster_hash,
             'usage_intervals': pickle.loads(usage_intervals),
             'status': status,
+            'user_hash': user_hash,
         }

         records.append(record)
@@ -712,17 +795,17 @@ def get_cluster_names_start_with(starts_with: str) -> List[str]:
     return [row[0] for row in rows]


-def get_cached_enabled_clouds() -> List[clouds.Cloud]:
+def get_cached_enabled_clouds() -> List['clouds.Cloud']:
     rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?',
                               (_ENABLED_CLOUDS_KEY,))
     ret = []
     for (value,) in rows:
         ret = json.loads(value)
         break
-    enabled_clouds: List[clouds.Cloud] = []
+    enabled_clouds: List['clouds.Cloud'] = []
     for c in ret:
         try:
-            cloud = clouds.CLOUD_REGISTRY.from_str(c)
+            cloud = registry.CLOUD_REGISTRY.from_str(c)
         except ValueError:
             # Handle the case for the clouds whose support has been removed from
             # SkyPilot, e.g., 'local' was a cloud in the past and may be stored
@@ -745,7 +828,7 @@ def add_or_update_storage(storage_name: str,
                           storage_status: status_lib.StorageStatus):
     storage_launched_at = int(time.time())
     handle = pickle.dumps(storage_handle)
-    last_use = common_utils.get_pretty_entry_point()
+    last_use = common_utils.get_current_command()

     def status_check(status):
         return status in status_lib.StorageStatus
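
The schema migration above backfills clusters.user_hash with the current user's hash, and the INSERT OR REPLACE keeps the original creator's hash via COALESCE. A standalone sqlite3 sketch of that keep-the-original semantics (the table and values here are illustrative, not SkyPilot's real schema):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE clusters (name TEXT PRIMARY KEY, user_hash TEXT)')
conn.execute('INSERT INTO clusters VALUES (?, ?)', ('c1', 'alice-hash'))

# A relaunch by another user re-runs INSERT OR REPLACE with their own hash;
# the subquery is evaluated against the existing row, so the original wins.
conn.execute(
    'INSERT OR REPLACE INTO clusters (name, user_hash) VALUES (?, '
    'COALESCE((SELECT user_hash FROM clusters WHERE name=?), ?))',
    ('c1', 'c1', 'bob-hash'))
print(conn.execute('SELECT user_hash FROM clusters WHERE name=?',
                   ('c1',)).fetchone()[0])  # -> alice-hash (original kept)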
sky/jobs/__init__.py CHANGED
@@ -1,47 +1,45 @@
 """Managed jobs."""
 import pathlib

+from sky.jobs.client.sdk import cancel
+from sky.jobs.client.sdk import dashboard
+from sky.jobs.client.sdk import download_logs
+from sky.jobs.client.sdk import launch
+from sky.jobs.client.sdk import queue
+from sky.jobs.client.sdk import tail_logs
 from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
+from sky.jobs.constants import JOBS_CONTROLLER_LOGS_DIR
 from sky.jobs.constants import JOBS_CONTROLLER_TEMPLATE
 from sky.jobs.constants import JOBS_CONTROLLER_YAML_PREFIX
 from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
-from sky.jobs.core import cancel
-from sky.jobs.core import launch
-from sky.jobs.core import queue
-from sky.jobs.core import queue_from_kubernetes_pod
-from sky.jobs.core import sync_down_logs
-from sky.jobs.core import tail_logs
-from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
-from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
+from sky.jobs.recovery_strategy import StrategyExecutor
 from sky.jobs.state import ManagedJobStatus
 from sky.jobs.utils import dump_managed_job_queue
 from sky.jobs.utils import format_job_table
-from sky.jobs.utils import JOB_CONTROLLER_NAME
 from sky.jobs.utils import load_managed_job_queue
 from sky.jobs.utils import ManagedJobCodeGen

 pathlib.Path(JOBS_TASK_YAML_PREFIX).expanduser().parent.mkdir(parents=True,
                                                               exist_ok=True)
 __all__ = [
-    'RECOVERY_STRATEGIES',
-    'DEFAULT_RECOVERY_STRATEGY',
-    'JOB_CONTROLLER_NAME',
     # Constants
     'JOBS_CONTROLLER_TEMPLATE',
     'JOBS_CONTROLLER_YAML_PREFIX',
     'JOBS_TASK_YAML_PREFIX',
+    'JOBS_CONTROLLER_LOGS_DIR',
     # Enums
     'ManagedJobStatus',
     # Core
     'cancel',
     'launch',
     'queue',
-    'queue_from_kubernetes_pod',
     'tail_logs',
-    'sync_down_logs',
+    'dashboard',
+    'download_logs',
     # utils
     'ManagedJobCodeGen',
     'format_job_table',
     'dump_managed_job_queue',
     'load_managed_job_queue',
+    'StrategyExecutor',
 ]
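
With this change the managed-jobs API is re-exported from the new client SDK (sky.jobs.client.sdk), sync_down_logs is replaced by download_logs, and dashboard is added. A rough usage sketch, assuming the client wrappers keep the call shapes of the server-side functions they front:

import sky
from sky import jobs

task = sky.Task(run='echo managed job')

jobs.launch(task, name='my-job')      # now sky.jobs.client.sdk.launch
jobs.queue(refresh=False)             # list managed jobs
jobs.tail_logs(name='my-job', job_id=None, follow=True)
jobs.download_logs(name='my-job', job_id=None)  # replaces sync_down_logs
jobs.cancel(name='my-job')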