skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (136)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend.py +5 -3
  4. sky/backends/backend_utils.py +22 -7
  5. sky/backends/cloud_vm_ray_backend.py +50 -18
  6. sky/backends/local_docker_backend.py +8 -3
  7. sky/client/cli/command.py +25 -10
  8. sky/client/sdk.py +51 -1
  9. sky/clouds/kubernetes.py +2 -6
  10. sky/clouds/nebius.py +3 -1
  11. sky/core.py +9 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  19. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  20. sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  23. sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  27. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
  28. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  29. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  32. sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
  41. sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
  47. sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
  48. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  49. sky/dashboard/out/clusters/[cluster].html +1 -1
  50. sky/dashboard/out/clusters.html +1 -1
  51. sky/dashboard/out/config.html +1 -1
  52. sky/dashboard/out/index.html +1 -1
  53. sky/dashboard/out/infra/[context].html +1 -1
  54. sky/dashboard/out/infra.html +1 -1
  55. sky/dashboard/out/jobs/[job].html +1 -1
  56. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  57. sky/dashboard/out/jobs.html +1 -1
  58. sky/dashboard/out/users.html +1 -1
  59. sky/dashboard/out/volumes.html +1 -1
  60. sky/dashboard/out/workspace/new.html +1 -1
  61. sky/dashboard/out/workspaces/[name].html +1 -1
  62. sky/dashboard/out/workspaces.html +1 -1
  63. sky/data/storage_utils.py +29 -9
  64. sky/execution.py +13 -10
  65. sky/global_user_state.py +131 -2
  66. sky/jobs/constants.py +1 -1
  67. sky/jobs/recovery_strategy.py +0 -3
  68. sky/jobs/scheduler.py +14 -21
  69. sky/jobs/server/core.py +64 -10
  70. sky/jobs/server/utils.py +1 -1
  71. sky/jobs/state.py +1 -3
  72. sky/jobs/utils.py +159 -11
  73. sky/provision/aws/config.py +19 -3
  74. sky/provision/aws/instance.py +2 -1
  75. sky/provision/kubernetes/instance.py +2 -1
  76. sky/provision/nebius/utils.py +101 -86
  77. sky/provision/provisioner.py +13 -8
  78. sky/resources.py +5 -5
  79. sky/schemas/api/responses.py +50 -1
  80. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  81. sky/serve/replica_managers.py +123 -101
  82. sky/serve/serve_state.py +32 -0
  83. sky/serve/serve_utils.py +37 -16
  84. sky/serve/service.py +51 -17
  85. sky/server/common.py +2 -3
  86. sky/server/constants.py +1 -1
  87. sky/server/requests/payloads.py +6 -0
  88. sky/server/requests/serializers/decoders.py +20 -5
  89. sky/server/requests/serializers/encoders.py +21 -8
  90. sky/server/server.py +57 -11
  91. sky/templates/kubernetes-ray.yml.j2 +1 -0
  92. sky/utils/cli_utils/status_utils.py +2 -1
  93. sky/utils/common_utils.py +20 -0
  94. sky/utils/controller_utils.py +17 -4
  95. sky/utils/db/migration_utils.py +1 -1
  96. sky/utils/log_utils.py +14 -5
  97. sky/utils/resources_utils.py +25 -1
  98. sky/utils/schemas.py +3 -0
  99. sky/utils/ux_utils.py +36 -5
  100. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
  101. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
  102. sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  104. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  105. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  106. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  109. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  110. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  111. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  114. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  117. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  119. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
  121. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  126. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
  131. /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
  132. /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
  133. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/data/storage_utils.py CHANGED
@@ -252,17 +252,28 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
 
 def zip_files_and_folders(items: List[str],
                           output_file: Union[str, pathlib.Path],
-                          log_file: Optional[TextIO] = None):
+                          log_file: Optional[TextIO] = None,
+                          relative_to_items: bool = False):
+
+    def _get_archive_name(file_path: str, item_path: str) -> str:
+        """Get the archive name for a file based on the relative parameters."""
+        if relative_to_items:
+            # Make paths relative to the item itself
+            return os.path.relpath(file_path, os.path.dirname(item_path))
+        else:
+            # Default: use full path (existing behavior)
+            return file_path
 
-    def _store_symlink(zipf, path: str, is_dir: bool):
+    def _store_symlink(zipf, path: str, archive_name: str, is_dir: bool):
         # Get the target of the symlink
         target = os.readlink(path)
         # Use relative path as absolute path will not be able to resolve on
         # remote API server.
         if os.path.isabs(target):
             target = os.path.relpath(target, os.path.dirname(path))
-        # Create a ZipInfo instance
-        zi = zipfile.ZipInfo(path + '/') if is_dir else zipfile.ZipInfo(path)
+        # Create a ZipInfo instance using the archive name
+        zi = zipfile.ZipInfo(archive_name +
+                             '/') if is_dir else zipfile.ZipInfo(archive_name)
         # Set external attributes to mark as symlink
         zi.external_attr = 0xA1ED0000
         # Write symlink target as content
@@ -281,7 +292,8 @@ def zip_files_and_folders(items: List[str],
                 # Add the file to the zip archive even if it matches
                 # patterns in dot ignore files, as it was explicitly
                 # specified by user.
-                zipf.write(item)
+                archive_name = _get_archive_name(item, item)
+                zipf.write(item, archive_name)
             elif os.path.isdir(item):
                 excluded_files = set([
                     os.path.join(item, f.rstrip('/'))
@@ -304,21 +316,29 @@ def zip_files_and_folders(items: List[str],
                     # directories)
                     for dir_name in dirs:
                         dir_path = os.path.join(root, dir_name)
+                        archive_name = _get_archive_name(dir_path, item)
                         # If it's a symlink, store it as a symlink
                         if os.path.islink(dir_path):
-                            _store_symlink(zipf, dir_path, is_dir=True)
+                            _store_symlink(zipf,
+                                           dir_path,
+                                           archive_name,
+                                           is_dir=True)
                         else:
-                            zipf.write(dir_path)
+                            zipf.write(dir_path, archive_name)
 
                     for file in files:
                         file_path = os.path.join(root, file)
                         if file_path in excluded_files:
                             continue
+                        archive_name = _get_archive_name(file_path, item)
                         if os.path.islink(file_path):
-                            _store_symlink(zipf, file_path, is_dir=False)
+                            _store_symlink(zipf,
                                           file_path,
+                                           archive_name,
+                                           is_dir=False)
                             continue
                         if stat.S_ISSOCK(os.stat(file_path).st_mode):
                             continue
-                        zipf.write(file_path)
+                        zipf.write(file_path, archive_name)
             if log_file is not None:
                 log_file.write(f'Zipped {item}\n')
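
Note: the new relative_to_items flag only changes the name each entry gets inside the archive, not which files are zipped. A minimal sketch of the effect, using the same os.path.relpath logic as _get_archive_name above (the paths are made up for illustration):

    import os

    def archive_name(file_path: str, item_path: str, relative_to_items: bool) -> str:
        # Mirrors _get_archive_name: either keep the full path (default), or
        # make it relative to the directory containing the zipped item.
        if relative_to_items:
            return os.path.relpath(file_path, os.path.dirname(item_path))
        return file_path

    # Hypothetical paths, for illustration only.
    print(archive_name('/home/user/proj/src/main.py', '/home/user/proj', False))
    # -> /home/user/proj/src/main.py   (existing behavior: full path)
    print(archive_name('/home/user/proj/src/main.py', '/home/user/proj', True))
    # -> proj/src/main.py              (relative to the item's parent)
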
sky/execution.py CHANGED
@@ -173,19 +173,12 @@ def _execute(
         if dryrun.
     """
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
-    dag.resolve_and_validate_volumes()
-    if (not _is_launched_by_jobs_controller and
-            not _is_launched_by_sky_serve_controller):
-        # Only process pre-mount operations on API server.
-        dag.pre_mount_volumes()
     for task in dag.tasks:
-        if task.storage_mounts is not None:
-            for storage in task.storage_mounts.values():
-                # Ensure the storage is constructed.
-                storage.construct()
         for resource in task.resources:
             # For backward compatibility, we need to override the autostop
-            # config at server-side for legacy clients.
+            # config at server-side for legacy clients. This should be set
+            # before admin policy to make the admin policy get the final
+            # value of autostop config.
             # TODO(aylei): remove this after we bump the API version.
             resource.override_autostop_config(
                 down=down, idle_minutes=idle_minutes_to_autostop)
@@ -200,6 +193,16 @@ def _execute(
                 down=down,
                 dryrun=dryrun,
             )) as dag:
+        dag.resolve_and_validate_volumes()
+        if (not _is_launched_by_jobs_controller and
+                not _is_launched_by_sky_serve_controller):
+            # Only process pre-mount operations on API server.
+            dag.pre_mount_volumes()
+        for task in dag.tasks:
+            if task.storage_mounts is not None:
+                for storage in task.storage_mounts.values():
+                    # Ensure the storage is constructed.
+                    storage.construct()
         return _execute_dag(
             dag,
             dryrun=dryrun,
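
The point of moving volume resolution, pre-mounting, and storage construction into the with-block is ordering: per the updated comment, the autostop override must be applied before the admin policy runs so the policy observes the final config. A self-contained sketch of that ordering concern with hypothetical names (not SkyPilot's API):

    import contextlib
    from typing import Optional

    class FakeResource:
        def __init__(self) -> None:
            self.autostop_minutes: Optional[int] = None

    @contextlib.contextmanager
    def apply_admin_policy(resource: FakeResource):
        # Stand-in for a policy hook: it can only act on what it observes here.
        print(f'policy sees autostop={resource.autostop_minutes}')
        yield resource

    r = FakeResource()
    r.autostop_minutes = 10          # override first...
    with apply_admin_policy(r):      # ...so the policy sees the final value
        pass                         # prints: policy sees autostop=10
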
sky/global_user_state.py CHANGED
@@ -6,6 +6,7 @@ Concepts:
 - Cluster handle: (non-user facing) an opaque backend handle for us to
   interact with a cluster.
 """
+import asyncio
 import enum
 import functools
 import json
@@ -51,6 +52,9 @@ _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
+DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
+MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
+
 Base = declarative.declarative_base()
 
 config_table = sqlalchemy.Table(
@@ -102,6 +106,9 @@ cluster_table = sqlalchemy.Table(
                       sqlalchemy.Text,
                       server_default=None),
     sqlalchemy.Column('is_managed', sqlalchemy.Integer, server_default='0'),
+    sqlalchemy.Column('provision_log_path',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 storage_table = sqlalchemy.Table(
@@ -161,6 +168,9 @@ cluster_history_table = sqlalchemy.Table(
                       sqlalchemy.Text,
                       server_default=None),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('provision_log_path',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 
@@ -430,6 +440,17 @@ def get_user_by_name(username: str) -> List[models.User]:
     ]
 
 
+@_init_db
+def get_user_by_name_match(username_match: str) -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter(
+            user_table.c.name.like(f'%{username_match}%')).all()
+    return [
+        models.User(id=row.id, name=row.name, created_at=row.created_at)
+        for row in rows
+    ]
+
+
 @_init_db
 def delete_user(user_id: str) -> None:
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -458,7 +479,8 @@ def add_or_update_cluster(cluster_name: str,
                           is_launch: bool = True,
                           config_hash: Optional[str] = None,
                           task_config: Optional[Dict[str, Any]] = None,
-                          is_managed: bool = False):
+                          is_managed: bool = False,
+                          provision_log_path: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.
 
     Args:
@@ -473,6 +495,7 @@ def add_or_update_cluster(cluster_name: str,
         task_config: The config of the task being launched.
         is_managed: Whether the cluster is launched by the
             controller.
+        provision_log_path: Absolute path to provision.log, if available.
     """
     assert _SQLALCHEMY_ENGINE is not None
     # FIXME: launched_at will be changed when `sky launch -c` is called.
@@ -555,6 +578,10 @@ def add_or_update_cluster(cluster_name: str,
                            if task_config else None,
             'last_creation_command': last_use,
         })
+    if provision_log_path is not None:
+        conditional_values.update({
+            'provision_log_path': provision_log_path,
+        })
 
     if (_SQLALCHEMY_ENGINE.dialect.name ==
             db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -618,6 +645,7 @@ def add_or_update_cluster(cluster_name: str,
             usage_intervals=pickle.dumps(usage_intervals),
             user_hash=user_hash,
             workspace=history_workspace,
+            provision_log_path=provision_log_path,
             **creation_info,
         )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
@@ -633,6 +661,7 @@ def add_or_update_cluster(cluster_name: str,
                     pickle.dumps(usage_intervals),
                 cluster_history_table.c.user_hash: history_hash,
                 cluster_history_table.c.workspace: history_workspace,
+                cluster_history_table.c.provision_log_path: provision_log_path,
                 **creation_info,
             })
         session.execute(do_update_stmt)
@@ -731,6 +760,41 @@ def get_last_cluster_event(cluster_hash: str,
     return row.reason
 
 
+def cleanup_cluster_events_with_retention(retention_hours: float) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_event_table).filter(
+            cluster_event_table.c.transitioned_at < time.time() -
+            retention_hours * 3600)
+        logger.debug(f'Deleting {query.count()} cluster events.')
+        query.delete()
+        session.commit()
+
+
+async def cluster_event_retention_daemon():
+    """Garbage collect cluster events periodically."""
+    while True:
+        logger.info('Running cluster event retention daemon...')
+        # Use the latest config.
+        skypilot_config.reload_config()
+        retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_event_retention_hours'),
+            DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
+        try:
+            if retention_hours >= 0:
+                cleanup_cluster_events_with_retention(retention_hours)
+        except asyncio.CancelledError:
+            logger.info('Cluster event retention daemon cancelled')
+            break
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error running cluster event retention daemon: {e}')
+
+        # Run daemon at most once every hour to avoid too frequent cleanup.
+        sleep_amount = max(retention_hours * 3600,
+                           MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
+        await asyncio.sleep(sleep_amount)
+
+
 def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
                        event_type: ClusterEventType) -> List[str]:
     """Returns the cluster events for the cluster.
@@ -793,11 +857,13 @@ def update_last_use(cluster_name: str):
 
 
 @_init_db
-def remove_cluster(cluster_name: str, terminate: bool) -> None:
+def remove_cluster(cluster_name: str, terminate: bool,
+                   remove_events: bool) -> None:
     """Removes cluster_name mapping."""
     assert _SQLALCHEMY_ENGINE is not None
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)
     usage_intervals = _get_cluster_usage_intervals(cluster_hash)
+    provision_log_path = get_cluster_provision_log_path(cluster_name)
 
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         # usage_intervals is not None and not empty
@@ -808,8 +874,19 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
             usage_intervals.append((start_time, end_time))
             _set_cluster_usage_intervals(cluster_hash, usage_intervals)
 
+        if provision_log_path:
+            assert cluster_hash is not None, cluster_name
+            session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash
+            ).filter(
+                cluster_history_table.c.provision_log_path.is_(None)
+            ).update({
+                cluster_history_table.c.provision_log_path: provision_log_path
+            })
+
         if terminate:
             session.query(cluster_table).filter_by(name=cluster_name).delete()
+            if remove_events:
                 session.query(cluster_event_table).filter_by(
                     cluster_hash=cluster_hash).delete()
         else:
@@ -915,6 +992,58 @@ def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
     return json.loads(row.metadata)
 
 
+@_init_db
+def get_cluster_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from clusters table, if recorded."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        return getattr(row, 'provision_log_path', None)
+
+
+@_init_db
+def get_cluster_history_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from cluster_history for this name.
+
+    If the cluster currently exists, we use its hash. Otherwise, we look up
+    historical rows by name and choose the most recent one based on
+    usage_intervals.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Try current cluster first (fast path)
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is not None:
+            row = session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash).first()
+            if row is not None:
+                return getattr(row, 'provision_log_path', None)
+
+        # Fallback: search history by name and pick the latest by
+        # usage_intervals
+        rows = session.query(cluster_history_table).filter_by(
+            name=cluster_name).all()
+        if not rows:
+            return None
+
+        def latest_timestamp(usages_bin) -> int:
+            try:
+                intervals = pickle.loads(usages_bin)
+                # intervals: List[Tuple[int, Optional[int]]]
+                if not intervals:
+                    return -1
+                _, end = intervals[-1]
+                return end if end is not None else int(time.time())
+            except Exception:  # pylint: disable=broad-except
+                return -1
+
+        latest_row = max(rows,
+                         key=lambda r: latest_timestamp(r.usage_intervals))
+        return getattr(latest_row, 'provision_log_path', None)
+
+
 @_init_db
 def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
     assert _SQLALCHEMY_ENGINE is not None
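
The retention daemon added above follows a common asyncio pattern: re-read the retention setting, delete expired rows, then sleep for at least a minimum interval so the loop never runs more than roughly once per hour. A generic, self-contained sketch of that loop with illustrative names (not the SkyPilot functions themselves):

    import asyncio
    import time

    MIN_INTERVAL_SECONDS = 3600
    events = [time.time() - 7200, time.time()]  # fake event timestamps

    def cleanup(retention_hours: float) -> None:
        cutoff = time.time() - retention_hours * 3600
        events[:] = [t for t in events if t >= cutoff]

    async def retention_daemon(retention_hours: float) -> None:
        while True:
            try:
                if retention_hours >= 0:  # a negative value disables cleanup
                    cleanup(retention_hours)
            except Exception as exc:  # keep the daemon alive on errors
                print(f'cleanup failed: {exc}')
            # Never loop more often than once per MIN_INTERVAL_SECONDS.
            await asyncio.sleep(max(retention_hours * 3600, MIN_INTERVAL_SECONDS))

    # e.g. scheduled once at server startup: asyncio.create_task(retention_daemon(24.0))
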
sky/jobs/constants.py CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION = 8
+MANAGED_JOBS_VERSION = 9
 
 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
sky/jobs/recovery_strategy.py CHANGED
@@ -261,9 +261,6 @@ class StrategyExecutor:
         if self.cluster_name is None:
             return
         if self.pool is None:
-            global_user_state.add_cluster_event(
-                self.cluster_name, None, 'Cluster was cleaned up.',
-                global_user_state.ClusterEventType.STATUS_CHANGE)
             managed_job_utils.terminate_cluster(self.cluster_name)
 
     def _launch(self,
sky/jobs/scheduler.py CHANGED
@@ -93,7 +93,7 @@ def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')
 
 
-def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
+def maybe_schedule_next_jobs() -> None:
    """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
    Here, "schedule" means to select job that is waiting, and allow it to
@@ -139,7 +139,7 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
        with filelock.FileLock(controller_utils.get_resources_lock_path(),
                               blocking=False):
            while True:
-                maybe_next_job = state.get_waiting_job(pool)
+                maybe_next_job = state.get_waiting_job()
                if maybe_next_job is None:
                    # Nothing left to start, break from scheduling loop
                    break
@@ -158,22 +158,11 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
                # an ALIVE_WAITING job, but we would be able to launch a WAITING
                # job.
                if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                    if not (controller_utils.can_provision() or
-                            actual_pool is not None):
+                    if not controller_utils.can_provision():
                        # Can't schedule anything, break from scheduling loop.
                        break
                elif current_state == state.ManagedJobScheduleState.WAITING:
                    if not _can_start_new_job(actual_pool):
-                        # If there is no job can be scheduled in the pool, we
-                        # try to schedule another job regardless of the pool.
-                        # This is to avoid the case where the pool is scaled
-                        # down at the same time as a job is done. In this case,
-                        # we won't have any job to schedule in the pool, but
-                        # other jobs in other pool (or no pool) can still be
-                        # scheduled.
-                        if pool is not None:
-                            pool = None
-                            continue
                        # Can't schedule anything, break from scheduling loop.
                        break
 
@@ -218,7 +207,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
    if is_resume:
        _start_controller(job_id, dag_yaml_path, env_file_path, pool)
    else:
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()
 
 
 @contextlib.contextmanager
@@ -243,6 +232,13 @@ def scheduled_launch(job_id: int):
    multiple uses of this context are nested, behavior is undefined. Don't do
    that.
    """
+    pool = state.get_pool_from_job_id(job_id)
+    # For pool, since there is no execution.launch, we don't need to have all
+    # the ALIVE_WAITING state. The state transition will be
+    # WAITING -> ALIVE -> DONE without any intermediate transitions.
+    if pool is not None:
+        yield
+        return
 
    # If we're already in LAUNCHING schedule_state, we don't need to wait.
    # This may be the case for the first launch of a job.
@@ -254,7 +250,6 @@ def scheduled_launch(job_id: int):
    while (state.get_job_schedule_state(job_id) !=
           state.ManagedJobScheduleState.LAUNCHING):
        time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
-    pool = state.get_pool_from_job_id(job_id)
 
    try:
        yield
@@ -268,7 +263,7 @@ def scheduled_launch(job_id: int):
        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive(job_id)
    finally:
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -283,19 +278,17 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
    if idempotent and (state.get_job_schedule_state(job_id)
                       == state.ManagedJobScheduleState.DONE):
        return
-    pool = state.get_pool_from_job_id(job_id)
 
    with filelock.FileLock(controller_utils.get_resources_lock_path()):
        state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()
 
 
 def _set_alive_waiting(job_id: int) -> None:
    """Should use wait_until_launch_okay() to transition to this state."""
    with filelock.FileLock(controller_utils.get_resources_lock_path()):
        state.scheduler_set_alive_waiting(job_id)
-        pool = state.get_pool_from_job_id(job_id)
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()
 
 
 def _can_start_new_job(pool: Optional[str]) -> bool:
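
The new fast path in scheduled_launch is an early yield-and-return inside a contextmanager: jobs that run on a pool skip the launch scheduling handshake entirely and transition WAITING -> ALIVE -> DONE. A stripped-down sketch of that pattern with illustrative names only:

    import contextlib

    @contextlib.contextmanager
    def scheduled_launch(uses_pool: bool):
        if uses_pool:
            # Fast path: no scheduling handshake, just run the body.
            yield
            return
        print('waiting for a launch slot...')   # stand-in for the normal path
        try:
            yield
        finally:
            print('scheduling other jobs...')   # stand-in for maybe_schedule_next_jobs()

    with scheduled_launch(uses_pool=True):
        print('launching on an existing pool worker')
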
sky/jobs/server/core.py CHANGED
@@ -497,7 +497,8 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -513,7 +514,14 @@
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -568,10 +576,18 @@ def _maybe_restart_controller(
 
 
 @usage_lib.entrypoint
-def queue(refresh: bool,
-          skip_finished: bool = False,
-          all_users: bool = False,
-          job_ids: Optional[List[int]] = None) -> List[Dict[str, Any]]:
+def queue(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.
 
@@ -601,6 +617,17 @@ def queue(refresh: bool,
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
     handle = _maybe_restart_controller(refresh,
                                        stopped_message='No in-progress '
                                        'managed jobs.',
@@ -609,7 +636,22 @@ def queue(refresh: bool,
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+    user_hashes: Optional[List[Optional[str]]] = None
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -622,8 +664,14 @@ def queue(refresh: bool,
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total
 
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if not all_users:
 
         def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
@@ -636,7 +684,6 @@ def queue(refresh: bool,
 
         jobs = list(filter(user_hash_matches_or_missing, jobs))
 
-    accessible_workspaces = workspaces_core.get_workspaces()
     jobs = list(
         filter(
             lambda job: job.get('workspace', skylet_constants.
@@ -655,7 +702,14 @@ def queue(refresh: bool,
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    return jobs
+    return managed_job_utils.filter_jobs(jobs,
+                                         workspace_match,
+                                         name_match,
+                                         pool_match,
+                                         page=page,
+                                         limit=limit,
+                                         user_match=user_match,
+                                         enable_user_match=True)
 
 
 @usage_lib.entrypoint
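
The new page/limit checks at the top of queue() make pagination opt-in: limit must be a positive integer, page defaults to 1 when only limit is given, and page without limit is rejected. A standalone sketch of the same validation (illustrative helper, mirrors the diff):

    from typing import Optional, Tuple

    def validate_pagination(
            page: Optional[int],
            limit: Optional[int]) -> Tuple[Optional[int], Optional[int]]:
        if limit is not None:
            if limit < 1:
                raise ValueError(f'Limit must be at least 1, got {limit}')
            if page is None:
                page = 1  # default to the first page when only limit is given
            if page < 1:
                raise ValueError(f'Page must be at least 1, got {page}')
        elif page is not None:
            raise ValueError('Limit must be specified when page is specified')
        return page, limit

    print(validate_pagination(None, 10))    # (1, 10)
    print(validate_pagination(None, None))  # (None, None)
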
sky/jobs/server/utils.py CHANGED
@@ -62,7 +62,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version
 
     # Load and filter jobs locally using existing method
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
sky/jobs/state.py CHANGED
@@ -1528,7 +1528,7 @@ def get_nonterminal_job_ids_by_pool(pool: str,
 
 
 @_init_db
-def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
+def get_waiting_job() -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.
 
     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1559,8 +1559,6 @@ def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
         job_info_table.c.priority >= sqlalchemy.func.coalesce(
             max_priority_subquery, 0),
     ]
-    if pool is not None:
-        select_conds.append(job_info_table.c.pool == pool)
     query = sqlalchemy.select(
         job_info_table.c.spot_job_id,
         job_info_table.c.schedule_state,