skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (105) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +148 -30
  5. sky/backends/cloud_vm_ray_backend.py +606 -223
  6. sky/catalog/__init__.py +7 -0
  7. sky/catalog/aws_catalog.py +4 -0
  8. sky/catalog/common.py +18 -0
  9. sky/catalog/data_fetchers/fetch_aws.py +13 -37
  10. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  11. sky/catalog/seeweb_catalog.py +184 -0
  12. sky/client/cli/command.py +2 -71
  13. sky/client/sdk_async.py +5 -2
  14. sky/clouds/__init__.py +2 -0
  15. sky/clouds/aws.py +23 -5
  16. sky/clouds/cloud.py +8 -0
  17. sky/clouds/kubernetes.py +2 -0
  18. sky/clouds/seeweb.py +463 -0
  19. sky/core.py +46 -12
  20. sky/dashboard/out/404.html +1 -1
  21. sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
  22. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  27. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  30. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
  31. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  32. sky/dashboard/out/clusters/[cluster].html +1 -1
  33. sky/dashboard/out/clusters.html +1 -1
  34. sky/dashboard/out/config.html +1 -1
  35. sky/dashboard/out/index.html +1 -1
  36. sky/dashboard/out/infra/[context].html +1 -1
  37. sky/dashboard/out/infra.html +1 -1
  38. sky/dashboard/out/jobs/[job].html +1 -1
  39. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/exceptions.py +5 -0
  47. sky/global_user_state.py +75 -26
  48. sky/jobs/client/sdk_async.py +4 -2
  49. sky/jobs/controller.py +4 -2
  50. sky/jobs/recovery_strategy.py +1 -1
  51. sky/jobs/state.py +26 -16
  52. sky/jobs/utils.py +67 -24
  53. sky/logs/agent.py +10 -2
  54. sky/provision/__init__.py +1 -0
  55. sky/provision/kubernetes/config.py +7 -2
  56. sky/provision/kubernetes/instance.py +84 -41
  57. sky/provision/kubernetes/utils.py +14 -3
  58. sky/provision/seeweb/__init__.py +11 -0
  59. sky/provision/seeweb/config.py +13 -0
  60. sky/provision/seeweb/instance.py +806 -0
  61. sky/provision/vast/instance.py +1 -1
  62. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  63. sky/schemas/generated/jobsv1_pb2.py +86 -0
  64. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  65. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  66. sky/server/config.py +14 -5
  67. sky/server/metrics.py +41 -8
  68. sky/server/requests/executor.py +41 -4
  69. sky/server/server.py +1 -0
  70. sky/server/uvicorn.py +11 -5
  71. sky/setup_files/dependencies.py +8 -1
  72. sky/skylet/constants.py +14 -8
  73. sky/skylet/job_lib.py +128 -10
  74. sky/skylet/log_lib.py +14 -3
  75. sky/skylet/log_lib.pyi +9 -0
  76. sky/skylet/services.py +203 -0
  77. sky/skylet/skylet.py +4 -0
  78. sky/task.py +62 -0
  79. sky/templates/kubernetes-ray.yml.j2 +120 -3
  80. sky/templates/seeweb-ray.yml.j2 +108 -0
  81. sky/utils/accelerator_registry.py +3 -1
  82. sky/utils/command_runner.py +35 -11
  83. sky/utils/command_runner.pyi +22 -0
  84. sky/utils/context_utils.py +15 -2
  85. sky/utils/controller_utils.py +11 -5
  86. sky/utils/db/migration_utils.py +1 -1
  87. sky/utils/git.py +559 -1
  88. sky/utils/resource_checker.py +8 -7
  89. sky/workspaces/core.py +57 -21
  90. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
  91. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
  92. sky/client/cli/git.py +0 -549
  93. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  98. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  101. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
  102. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
  103. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
  104. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
  105. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
@@ -31,6 +31,7 @@ import time
31
31
  import typing
32
32
  from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
33
33
 
34
+ import psutil
34
35
  import setproctitle
35
36
 
36
37
  from sky import exceptions
@@ -130,8 +131,9 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
130
131
  def executor_initializer(proc_group: str):
131
132
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
132
133
  f'{multiprocessing.current_process().pid}')
134
+ # Executor never stops, unless the whole process is killed.
133
135
  threading.Thread(target=metrics_lib.process_monitor,
134
- args=(f'worker:{proc_group}',),
136
+ args=(f'worker:{proc_group}', threading.Event()),
135
137
  daemon=True).start()
136
138
 
137
139
 
@@ -373,11 +375,13 @@ def _request_execution_wrapper(request_id: str,
373
375
  4. Handle the SIGTERM signal to abort the request gracefully.
374
376
  5. Maintain the lifecycle of the temp dir used by the request.
375
377
  """
378
+ pid = multiprocessing.current_process().pid
379
+ proc = psutil.Process(pid)
380
+ rss_begin = proc.memory_info().rss
376
381
  db_utils.set_max_connections(num_db_connections_per_worker)
377
382
  # Handle the SIGTERM signal to abort the request processing gracefully.
378
383
  signal.signal(signal.SIGTERM, _sigterm_handler)
379
384
 
380
- pid = multiprocessing.current_process().pid
381
385
  logger.info(f'Running request {request_id} with pid {pid}')
382
386
  with api_requests.update_request(request_id) as request_task:
383
387
  assert request_task is not None, request_id
@@ -443,8 +447,41 @@ def _request_execution_wrapper(request_id: str,
443
447
  _restore_output(original_stdout, original_stderr)
444
448
  logger.info(f'Request {request_id} finished')
445
449
  finally:
446
- with metrics_lib.time_it(name='release_memory', group='internal'):
447
- common_utils.release_memory()
450
+ try:
451
+ # Capture the peak RSS before GC.
452
+ peak_rss = max(proc.memory_info().rss,
453
+ metrics_lib.peak_rss_bytes)
454
+ with metrics_lib.time_it(name='release_memory',
455
+ group='internal'):
456
+ common_utils.release_memory()
457
+ _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
458
+ except Exception as e: # pylint: disable=broad-except
459
+ logger.error(f'Failed to record memory metrics: '
460
+ f'{common_utils.format_exception(e)}')
461
+
462
+
463
+ _first_request = True
464
+
465
+
466
+ def _record_memory_metrics(request_name: str, proc: psutil.Process,
467
+ rss_begin: int, peak_rss: int) -> None:
468
+ """Record the memory metrics for a request."""
469
+ # Do not record full memory delta for the first request as it
470
 + # will load the sky core modules and make the memory usage
471
+ # estimation inaccurate.
472
+ global _first_request
473
+ if _first_request:
474
+ _first_request = False
475
+ return
476
+ rss_end = proc.memory_info().rss
477
+
478
+ # Answer "how much RSS this request contributed?"
479
+ metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
480
+ name=request_name).observe(max(rss_end - rss_begin, 0))
481
 + # Estimate the memory usage of the request by capturing the
482
+ # peak memory delta during the request execution.
483
+ metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
484
+ name=request_name).observe(max(peak_rss - rss_begin, 0))
448
485
 
449
486
 
450
487
  async def execute_request_coroutine(request: api_requests.Request):
sky/server/server.py CHANGED
@@ -1214,6 +1214,7 @@ async def logs(
1214
1214
  request_body=cluster_job_body,
1215
1215
  func=core.tail_logs,
1216
1216
  schedule_type=requests_lib.ScheduleType.SHORT,
1217
+ request_cluster_name=cluster_job_body.cluster_name,
1217
1218
  )
1218
1219
  task = asyncio.create_task(executor.execute_request_coroutine(request_task))
1219
1220
 
sky/server/uvicorn.py CHANGED
@@ -213,11 +213,17 @@ class Server(uvicorn.Server):
213
213
  # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
214
214
  event_loop.set_debug(True)
215
215
  event_loop.slow_callback_duration = lag_threshold
216
- threading.Thread(target=metrics_lib.process_monitor,
217
- args=('server',),
218
- daemon=True).start()
219
- with self.capture_signals():
220
- asyncio.run(self.serve(*args, **kwargs))
216
+ stop_monitor = threading.Event()
217
+ monitor = threading.Thread(target=metrics_lib.process_monitor,
218
+ args=('server', stop_monitor),
219
+ daemon=True)
220
+ monitor.start()
221
+ try:
222
+ with self.capture_signals():
223
+ asyncio.run(self.serve(*args, **kwargs))
224
+ finally:
225
+ stop_monitor.set()
226
+ monitor.join()
221
227
 
222
228
 
223
229
  def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
@@ -110,7 +110,8 @@ server_dependencies = [
110
110
  local_ray = [
111
111
  # Lower version of ray will cause dependency conflict for
112
112
  # click/grpcio/protobuf.
113
- # Ray 2.6.1+ resolved cluster launcher bugs and grpcio issues on Apple Silicon.
113
+ # Ray 2.6.1+ resolved cluster launcher bugs
114
+ # and grpcio issues on Apple Silicon.
114
115
  # https://github.com/ray-project/ray/releases/tag/ray-2.6.1
115
116
  'ray[default] >= 2.6.1',
116
117
  ]
@@ -200,9 +201,14 @@ extras_require: Dict[str, List[str]] = {
200
201
  # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
201
202
  ],
202
203
  'nebius': [
204
+ # Nebius requires grpcio and protobuf, so we need to include
205
+ # our constraints here.
203
206
  'nebius>=0.2.47',
207
+ GRPC,
208
+ PROTOBUF,
204
209
  ] + aws_dependencies,
205
210
  'hyperbolic': [], # No dependencies needed for hyperbolic
211
+ 'seeweb': ['ecsapi>=0.2.0'],
206
212
  'server': server_dependencies,
207
213
  }
208
214
 
@@ -213,6 +219,7 @@ clouds_for_all.remove('remote')
213
219
  if sys.version_info < (3, 10):
214
220
  # Nebius needs python3.10. If python 3.9 [all] will not install nebius
215
221
  clouds_for_all.remove('nebius')
222
+ clouds_for_all.remove('seeweb')
216
223
 
217
224
  if sys.version_info >= (3, 12):
218
225
  # The version of ray we use does not work with >= 3.12, so avoid clouds
sky/skylet/constants.py CHANGED
@@ -62,11 +62,14 @@ SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
62
62
  'curl -LsSf https://astral.sh/uv/install.sh '
63
63
  f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
64
64
  SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
65
- # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
66
- # environment. `deactivate` command does not work when conda is used.
65
+ SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run')
66
+ # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
67
+ # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
68
+ # not work when conda is used.
67
69
  DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
68
70
  'export PATH='
69
- f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
71
+ f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||") && '
72
+ 'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
70
73
 
71
74
  # Prefix for SkyPilot environment variables
72
75
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
@@ -98,7 +101,7 @@ SKYLET_VERSION = '18'
98
101
  SKYLET_LIB_VERSION = 4
99
102
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
100
103
  SKYLET_GRPC_PORT = 46590
101
- SKYLET_GRPC_TIMEOUT_SECONDS = 5
104
+ SKYLET_GRPC_TIMEOUT_SECONDS = 10
102
105
 
103
106
  # Docker default options
104
107
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -229,7 +232,7 @@ RAY_INSTALLATION_COMMANDS = (
229
232
  'export PATH=$PATH:$HOME/.local/bin; '
230
233
  # Writes ray path to file if it does not exist or the file is empty.
231
234
  f'[ -s {SKY_RAY_PATH_FILE} ] || '
232
- f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
235
+ f'{{ {SKY_UV_RUN_CMD} '
233
236
  f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
234
237
 
235
238
  SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
@@ -374,7 +377,6 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
374
377
  ('ssh', 'pod_config'),
375
378
  ('kubernetes', 'custom_metadata'),
376
379
  ('kubernetes', 'pod_config'),
377
- ('kubernetes', 'context_configs'),
378
380
  ('kubernetes', 'provision_timeout'),
379
381
  ('kubernetes', 'dws'),
380
382
  ('kubernetes', 'kueue'),
@@ -382,6 +384,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
382
384
  ('gcp', 'enable_gvnic'),
383
385
  ('gcp', 'enable_gpu_direct'),
384
386
  ('gcp', 'placement_policy'),
387
+ ('active_workspace',),
385
388
  ]
386
389
  # When overriding the SkyPilot configs on the API server with the client one,
387
390
  # we skip the following keys because they are meant to be client-side configs.
@@ -449,11 +452,11 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
449
452
  # BEGIN constants used for service catalog.
450
453
  HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
451
454
  HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
452
- CATALOG_SCHEMA_VERSION = 'v7'
455
+ CATALOG_SCHEMA_VERSION = 'v8'
453
456
  CATALOG_DIR = '~/.sky/catalogs'
454
457
  ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
455
458
  'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
456
- 'paperspace', 'do', 'nebius', 'ssh', 'hyperbolic')
459
+ 'paperspace', 'do', 'nebius', 'ssh', 'hyperbolic', 'seeweb')
457
460
  # END constants used for service catalog.
458
461
 
459
462
  # The user ID of the SkyPilot system.
@@ -510,3 +513,6 @@ SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
510
513
 
511
514
  ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
512
515
  'DEBUG_LOOP_LAG_THRESHOLD_MS')
516
+
517
+ ARM64_ARCH = 'arm64'
518
+ X86_64_ARCH = 'x86_64'
sky/skylet/job_lib.py CHANGED
@@ -31,8 +31,11 @@ from sky.utils.db import db_utils
31
31
 
32
32
  if typing.TYPE_CHECKING:
33
33
  import psutil
34
+
35
+ from sky.schemas.generated import jobsv1_pb2
34
36
  else:
35
37
  psutil = adaptors_common.LazyImport('psutil')
38
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
36
39
 
37
40
  logger = sky_logging.init_logger(__name__)
38
41
 
@@ -220,6 +223,45 @@ class JobStatus(enum.Enum):
220
223
  color = _JOB_STATUS_TO_COLOR[self]
221
224
  return f'{color}{self.value}{colorama.Style.RESET_ALL}'
222
225
 
226
+ @classmethod
227
+ def from_protobuf(
228
+ cls,
229
+ protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
230
+ """Convert protobuf JobStatus enum to Python enum value."""
231
+ protobuf_to_enum = {
232
+ jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
233
+ jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
234
+ jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
235
+ jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
236
+ jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
237
+ jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
238
+ jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
239
+ jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
240
+ jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
241
+ jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
242
+ }
243
+ if protobuf_value not in protobuf_to_enum:
244
+ raise ValueError(
245
+ f'Unknown protobuf JobStatus value: {protobuf_value}')
246
+ return protobuf_to_enum[protobuf_value]
247
+
248
+ def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
249
+ """Convert this Python enum value to protobuf enum value."""
250
+ enum_to_protobuf = {
251
+ JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
252
+ JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
253
+ JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
254
+ JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
255
+ JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
256
+ JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
257
+ JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
258
+ JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
259
+ JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
260
+ }
261
+ if self not in enum_to_protobuf:
262
+ raise ValueError(f'Unknown JobStatus value: {self}')
263
+ return enum_to_protobuf[self]
264
+
223
265
 
224
266
  # We have two steps for job submissions:
225
267
  # 1. Client reserve a job id from the job table by adding a INIT state job.
@@ -475,6 +517,11 @@ def get_status(job_id: int) -> Optional[JobStatus]:
475
517
 
476
518
  @init_db
477
519
  def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
520
+ return message_utils.encode_payload(get_statuses(job_ids))
521
+
522
+
523
+ @init_db
524
+ def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
478
525
  assert _DB is not None
479
526
  # Per-job lock is not required here, since the staled job status will not
480
527
  # affect the caller.
@@ -482,10 +529,52 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
482
529
  rows = _DB.cursor.execute(
483
530
  f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
484
531
  job_ids)
485
- statuses = {job_id: None for job_id in job_ids}
532
+ statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
486
533
  for (job_id, status) in rows:
487
534
  statuses[job_id] = status
488
- return message_utils.encode_payload(statuses)
535
+ return statuses
536
+
537
+
538
+ @init_db
539
+ def get_jobs_info(user_hash: Optional[str] = None,
540
+ all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
541
+ """Get detailed job information.
542
+
543
+ Similar to dump_job_queue but returns structured protobuf objects instead
544
+ of encoded strings.
545
+
546
+ Args:
547
+ user_hash: The user hash to show jobs for. Show all the users if None.
548
+ all_jobs: Whether to show all jobs, not just the pending/running ones.
549
+ """
550
+ assert _DB is not None
551
+
552
+ status_list: Optional[List[JobStatus]] = [
553
+ JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
554
+ ]
555
+ if all_jobs:
556
+ status_list = None
557
+
558
+ jobs = _get_jobs(user_hash, status_list=status_list)
559
+ jobs_info = []
560
+ for job in jobs:
561
+ jobs_info.append(
562
+ jobsv1_pb2.JobInfo(
563
+ job_id=job['job_id'],
564
+ job_name=job['job_name'],
565
+ username=job['username'],
566
+ submitted_at=job['submitted_at'],
567
+ status=job['status'].to_protobuf(),
568
+ run_timestamp=job['run_timestamp'],
569
+ start_at=job['start_at']
570
+ if job['start_at'] is not None else -1.0,
571
+ end_at=job['end_at'] if job['end_at'] is not None else 0.0,
572
+ resources=job['resources'] or '',
573
+ pid=job['pid'],
574
+ log_path=os.path.join(constants.SKY_LOGS_DIRECTORY,
575
+ job['run_timestamp']),
576
+ metadata=json.dumps(job['metadata'])))
577
+ return jobs_info
489
578
 
490
579
 
491
580
  def load_statuses_payload(
@@ -527,13 +616,24 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
527
616
  `format_job_queue()`), because the job may stay in PENDING if the cluster is
528
617
  busy.
529
618
  """
619
+ return message_utils.encode_payload(
620
+ get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
621
+
622
+
623
+ @init_db
624
+ def get_job_submitted_or_ended_timestamp(
625
+ job_id: int, get_ended_time: bool) -> Optional[float]:
626
 + """Get the job submitted or ended timestamp.
627
+
628
+ Returns the raw timestamp or None if job doesn't exist.
629
+ """
530
630
  assert _DB is not None
531
631
  field = 'end_at' if get_ended_time else 'submitted_at'
532
632
  rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
533
633
  (job_id,))
534
634
  for (timestamp,) in rows:
535
- return message_utils.encode_payload(timestamp)
536
- return message_utils.encode_payload(None)
635
+ return timestamp
636
+ return None
537
637
 
538
638
 
539
639
  def get_ray_port():
@@ -947,6 +1047,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
947
1047
  Encoded job IDs that are actually cancelled. Caller should use
948
1048
  message_utils.decode_payload() to parse.
949
1049
  """
1050
+ return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
1051
+ user_hash))
1052
+
1053
+
1054
+ def cancel_jobs(jobs: Optional[List[int]],
1055
+ cancel_all: bool = False,
1056
+ user_hash: Optional[str] = None) -> List[int]:
950
1057
  job_records = []
951
1058
  all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
952
1059
  if jobs is None and not cancel_all:
@@ -1010,7 +1117,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
1010
1117
  cancelled_ids.append(job['job_id'])
1011
1118
 
1012
1119
  scheduler.schedule_step()
1013
- return message_utils.encode_payload(cancelled_ids)
1120
+ return cancelled_ids
1014
1121
 
1015
1122
 
1016
1123
  @init_db
@@ -1030,6 +1137,17 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
1030
1137
 
1031
1138
  @init_db
1032
1139
  def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
1140
+ """Returns the relative paths to the log files for jobs with globbing,
1141
+ encoded."""
1142
+ job_to_dir = get_job_log_dirs(job_ids)
1143
+ job_to_dir_str: Dict[str, str] = {}
1144
+ for job_id, log_dir in job_to_dir.items():
1145
+ job_to_dir_str[str(job_id)] = log_dir
1146
+ return message_utils.encode_payload(job_to_dir_str)
1147
+
1148
+
1149
+ @init_db
1150
+ def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
1033
1151
  """Returns the relative paths to the log files for jobs with globbing."""
1034
1152
  assert _DB is not None
1035
1153
  query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
@@ -1038,16 +1156,16 @@ def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
1038
1156
  SELECT * FROM jobs
1039
1157
  WHERE {query_str}""", job_ids)
1040
1158
  rows = _DB.cursor.fetchall()
1041
- job_to_dir = {}
1159
+ job_to_dir: Dict[int, str] = {}
1042
1160
  for row in rows:
1043
1161
  job_id = row[JobInfoLoc.JOB_ID.value]
1044
1162
  if row[JobInfoLoc.LOG_PATH.value]:
1045
- job_to_dir[str(job_id)] = row[JobInfoLoc.LOG_PATH.value]
1163
+ job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
1046
1164
  else:
1047
1165
  run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
1048
- job_to_dir[str(job_id)] = os.path.join(constants.SKY_LOGS_DIRECTORY,
1049
- run_timestamp)
1050
- return message_utils.encode_payload(job_to_dir)
1166
+ job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
1167
+ run_timestamp)
1168
+ return job_to_dir
1051
1169
 
1052
1170
 
1053
1171
  class JobLibCodeGen:
sky/skylet/log_lib.py CHANGED
@@ -354,6 +354,17 @@ def run_bash_command_with_log(bash_command: str,
354
354
  shell=True)
355
355
 
356
356
 
357
+ def run_bash_command_with_log_and_return_pid(
358
+ bash_command: str,
359
+ log_path: str,
360
+ env_vars: Optional[Dict[str, str]] = None,
361
+ stream_logs: bool = False,
362
+ with_ray: bool = False):
363
+ return_code = run_bash_command_with_log(bash_command, log_path, env_vars,
364
+ stream_logs, with_ray)
365
+ return {'return_code': return_code, 'pid': os.getpid()}
366
+
367
+
357
368
  def _follow_job_logs(file,
358
369
  job_id: int,
359
370
  start_streaming: bool,
@@ -395,9 +406,9 @@ def _follow_job_logs(file,
395
406
  wait_last_logs = False
396
407
  continue
397
408
  status_str = status.value if status is not None else 'None'
398
- print(ux_utils.finishing_message(
399
- f'Job finished (status: {status_str}).'),
400
- flush=True)
409
+ finish = ux_utils.finishing_message(
410
+ f'Job finished (status: {status_str}).')
411
+ yield finish + '\n'
401
412
  return
402
413
 
403
414
  time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
sky/skylet/log_lib.pyi CHANGED
@@ -129,6 +129,15 @@ def run_bash_command_with_log(bash_command: str,
129
129
  ...
130
130
 
131
131
 
132
+ def run_bash_command_with_log_and_return_pid(
133
+ bash_command: str,
134
+ log_path: str,
135
+ env_vars: Optional[Dict[str, str]] = ...,
136
+ stream_logs: bool = ...,
137
+ with_ray: bool = ...):
138
+ ...
139
+
140
+
132
141
  def tail_logs(job_id: int,
133
142
  log_dir: Optional[str],
134
143
  managed_job_id: Optional[int] = ...,