skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251029__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/coreweave.py +278 -0
  3. sky/backends/backend_utils.py +9 -6
  4. sky/backends/cloud_vm_ray_backend.py +2 -3
  5. sky/check.py +25 -13
  6. sky/client/cli/command.py +5 -1
  7. sky/cloud_stores.py +73 -0
  8. sky/core.py +7 -5
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → DabuSAKsc_y0wyJxpTIdQ}/_buildManifest.js +1 -1
  11. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +26 -0
  13. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-485984ca04e021d0.js} +1 -1
  26. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  27. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  28. sky/dashboard/out/clusters/[cluster].html +1 -1
  29. sky/dashboard/out/clusters.html +1 -1
  30. sky/dashboard/out/config.html +1 -1
  31. sky/dashboard/out/index.html +1 -1
  32. sky/dashboard/out/infra/[context].html +1 -1
  33. sky/dashboard/out/infra.html +1 -1
  34. sky/dashboard/out/jobs/[job].html +1 -1
  35. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/dashboard/out/users.html +1 -1
  38. sky/dashboard/out/volumes.html +1 -1
  39. sky/dashboard/out/workspace/new.html +1 -1
  40. sky/dashboard/out/workspaces/[name].html +1 -1
  41. sky/dashboard/out/workspaces.html +1 -1
  42. sky/data/data_utils.py +92 -1
  43. sky/data/mounting_utils.py +39 -0
  44. sky/data/storage.py +166 -9
  45. sky/global_user_state.py +14 -18
  46. sky/jobs/server/server.py +2 -2
  47. sky/jobs/utils.py +5 -6
  48. sky/optimizer.py +1 -1
  49. sky/provision/kubernetes/instance.py +88 -19
  50. sky/provision/kubernetes/volume.py +2 -2
  51. sky/schemas/api/responses.py +2 -5
  52. sky/serve/replica_managers.py +2 -2
  53. sky/serve/serve_utils.py +9 -2
  54. sky/server/requests/payloads.py +2 -0
  55. sky/server/requests/requests.py +137 -102
  56. sky/server/requests/serializers/decoders.py +0 -6
  57. sky/server/requests/serializers/encoders.py +33 -6
  58. sky/server/server.py +2 -1
  59. sky/server/stream_utils.py +56 -13
  60. sky/setup_files/dependencies.py +2 -0
  61. sky/task.py +10 -0
  62. sky/templates/nebius-ray.yml.j2 +1 -0
  63. sky/utils/cli_utils/status_utils.py +8 -2
  64. sky/utils/context_utils.py +13 -1
  65. sky/utils/resources_utils.py +53 -29
  66. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/METADATA +52 -36
  67. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/RECORD +73 -72
  68. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  69. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  70. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  72. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  73. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  75. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  76. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → DabuSAKsc_y0wyJxpTIdQ}/_ssgManifest.js +0 -0
  77. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  78. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/top_level.txt +0 -0
sky/schemas/api/responses.py CHANGED
@@ -90,7 +90,7 @@ class StatusResponse(ResponseBaseModel):
     # This is an internally facing field anyway, so it's less
     # of a problem that it's not typed.
     handle: Optional[Any] = None
-    last_use: str
+    last_use: Optional[str] = None
     status: status_lib.ClusterStatus
     autostop: int
     to_down: bool
@@ -98,11 +98,8 @@ class StatusResponse(ResponseBaseModel):
     # metadata is a JSON, so we use Any here.
     metadata: Optional[Dict[str, Any]] = None
     cluster_hash: str
-    # pydantic cannot generate the pydantic-core schema for
-    # storage_mounts_metadata, so we use Any here.
-    storage_mounts_metadata: Optional[Dict[str, Any]] = None
     cluster_ever_up: bool
-    status_updated_at: int
+    status_updated_at: Optional[int] = None
     user_hash: str
     user_name: str
     config_hash: Optional[str] = None
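Making last_use and status_updated_at optional lets newer clients accept status payloads from older servers that omit those fields; it pairs with the encode_status change later in this diff, which dumps with exclude_none=True and backfills defaults for old clients. A minimal pydantic v2 sketch of the pattern (the model and field values here are illustrative, not SkyPilot's actual model):

from typing import Optional

import pydantic


class DemoStatus(pydantic.BaseModel):
    # Previously-required fields get a None default, so a payload that
    # omits them still validates instead of raising a ValidationError.
    last_use: Optional[str] = None
    status_updated_at: Optional[int] = None


print(DemoStatus().model_dump(exclude_none=True))  # -> {}
print(DemoStatus(last_use='sky launch').model_dump(exclude_none=True))
# -> {'last_use': 'sky launch'}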
sky/serve/replica_managers.py CHANGED
@@ -495,8 +495,8 @@ class ReplicaInfo:
             info_dict['cloud'] = repr(handle.launched_resources.cloud)
             info_dict['region'] = handle.launched_resources.region
             info_dict['resources_str'] = (
-                resources_utils.get_readable_resources_repr(handle,
-                                                            simplify=True))
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=True)[0])
         return info_dict

     def __repr__(self) -> str:
sky/serve/serve_utils.py CHANGED
@@ -1550,8 +1550,15 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
             'handle']
         if replica_handle is not None:
             infra = replica_handle.launched_resources.infra.formatted_str()
-            resources_str = resources_utils.get_readable_resources_repr(
-                replica_handle, simplify=not show_all)
+            simplified = not show_all
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    replica_handle, simplified_only=simplified))
+            if simplified:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full

         replica_values = [
             service_name,
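The call sites above suggest that get_readable_resources_repr's simplify flag was replaced by simplified_only, with the return type changing from a single string to a (simple, full) pair where full may be None. That contract is inferred from the calls in this diff, not from the function itself (see sky/utils/resources_utils.py in the file list). A hypothetical stand-in capturing it:

from typing import Optional, Tuple


def readable_resources_repr_demo(
        simplified_only: bool) -> Tuple[str, Optional[str]]:
    # Hypothetical stand-in; both strings are made up for illustration.
    simple = '1x[CPU:4]'
    full = None if simplified_only else '1x AWS(m6i.xlarge, cpus=4)'
    return simple, full


def pick_resources_str(show_all: bool) -> str:
    simple, full = readable_resources_repr_demo(simplified_only=not show_all)
    if not show_all:
        return simple
    assert full is not None  # Mirrors the assertion in the diff above.
    return full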
sky/server/requests/payloads.py CHANGED
@@ -319,6 +319,8 @@ class StatusBody(RequestBody):
    # Only return fields that are needed for the
    # dashboard / CLI summary response
    summary_response: bool = False
+    # Include the cluster handle in the response
+    include_handle: bool = True


 class StartBody(RequestBody):
sky/server/requests/requests.py CHANGED
@@ -5,7 +5,6 @@ import contextlib
 import dataclasses
 import enum
 import functools
-import json
 import os
 import pathlib
 import shutil
@@ -21,6 +20,7 @@ import uuid
 import anyio
 import colorama
 import filelock
+import orjson

 from sky import exceptions
 from sky import global_user_state
@@ -213,8 +213,8 @@ class Request:
            entrypoint=self.entrypoint.__name__,
            request_body=self.request_body.model_dump_json(),
            status=self.status.value,
-            return_value=json.dumps(None),
-            error=json.dumps(None),
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
            pid=None,
            created_at=self.created_at,
            schedule_type=self.schedule_type.value,
@@ -237,8 +237,8 @@ class Request:
            entrypoint=encoders.pickle_and_encode(self.entrypoint),
            request_body=encoders.pickle_and_encode(self.request_body),
            status=self.status.value,
-            return_value=json.dumps(self.return_value),
-            error=json.dumps(self.error),
+            return_value=orjson.dumps(self.return_value).decode('utf-8'),
+            error=orjson.dumps(self.error).decode('utf-8'),
            pid=self.pid,
            created_at=self.created_at,
            schedule_type=self.schedule_type.value,
@@ -270,8 +270,8 @@ class Request:
            entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
            request_body=decoders.decode_and_unpickle(payload.request_body),
            status=RequestStatus(payload.status),
-            return_value=json.loads(payload.return_value),
-            error=json.loads(payload.error),
+            return_value=orjson.loads(payload.return_value),
+            error=orjson.loads(payload.error),
            pid=payload.pid,
            created_at=payload.created_at,
            schedule_type=ScheduleType(payload.schedule_type),
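A note on the json-to-orjson swap in the hunks above: orjson.dumps returns bytes rather than str, hence the .decode('utf-8') at every write site, while orjson.loads accepts both. A minimal check:

import json

import orjson

# Decoding keeps the stored TEXT values byte-for-byte compatible with
# what json.dumps used to produce.
assert json.dumps(None) == 'null'
assert orjson.dumps(None) == b'null'
assert orjson.dumps(None).decode('utf-8') == json.dumps(None)
# Reads need no special casing: orjson.loads accepts str or bytes.
assert orjson.loads('null') is None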
@@ -328,10 +328,11 @@ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
        entrypoint=request.entrypoint.__name__
        if request.entrypoint is not None else '',
        request_body=request.request_body.model_dump_json()
-        if request.request_body is not None else json.dumps(None),
+        if request.request_body is not None else
+        orjson.dumps(None).decode('utf-8'),
        status=request.status.value,
-        return_value=json.dumps(None),
-        error=json.dumps(None),
+        return_value=orjson.dumps(None).decode('utf-8'),
+        error=orjson.dumps(None).decode('utf-8'),
        pid=None,
        created_at=request.created_at,
        schedule_type=request.schedule_type.value,
@@ -372,9 +373,9 @@ def _update_request_row_fields(
    if 'user_id' not in fields:
        content['user_id'] = ''
    if 'return_value' not in fields:
-        content['return_value'] = json.dumps(None)
+        content['return_value'] = orjson.dumps(None).decode('utf-8')
    if 'error' not in fields:
-        content['error'] = json.dumps(None)
+        content['error'] = orjson.dumps(None).decode('utf-8')
    if 'schedule_type' not in fields:
        content['schedule_type'] = ScheduleType.SHORT.value
    # Optional fields in RequestPayload
@@ -393,94 +394,6 @@
     return tuple(content[col] for col in REQUEST_COLUMNS)


-def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
-    """Kill all pending and running requests for a cluster.
-
-    Args:
-        cluster_name: the name of the cluster.
-        exclude_request_names: exclude requests with these names. This is to
-            prevent killing the caller request.
-    """
-    request_ids = [
-        request_task.request_id
-        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-            exclude_request_names=[exclude_request_name],
-            cluster_names=[cluster_name],
-            fields=['request_id']))
-    ]
-    kill_requests(request_ids)
-
-
-def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
-                              user_id: Optional[str] = None) -> List[str]:
-    """Kill requests with a given request ID prefix."""
-    expanded_request_ids: Optional[List[str]] = None
-    if request_ids is not None:
-        expanded_request_ids = []
-        for request_id in request_ids:
-            request_tasks = get_requests_with_prefix(request_id,
-                                                     fields=['request_id'])
-            if request_tasks is None or len(request_tasks) == 0:
-                continue
-            if len(request_tasks) > 1:
-                raise ValueError(f'Multiple requests found for '
-                                 f'request ID prefix: {request_id}')
-            expanded_request_ids.append(request_tasks[0].request_id)
-    return kill_requests(request_ids=expanded_request_ids, user_id=user_id)
-
-
-def kill_requests(request_ids: Optional[List[str]] = None,
-                  user_id: Optional[str] = None) -> List[str]:
-    """Kill a SkyPilot API request and set its status to cancelled.
-
-    Args:
-        request_ids: The request IDs to kill. If None, all requests for the
-            user are killed.
-        user_id: The user ID to kill requests for. If None, all users are
-            killed.
-
-    Returns:
-        A list of request IDs that were cancelled.
-    """
-    if request_ids is None:
-        request_ids = [
-            request_task.request_id
-            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-                # Avoid cancelling the cancel request itself.
-                exclude_request_names=['sky.api_cancel'],
-                user_id=user_id,
-                fields=['request_id']))
-        ]
-    cancelled_request_ids = []
-    for request_id in request_ids:
-        with update_request(request_id) as request_record:
-            if request_record is None:
-                logger.debug(f'No request ID {request_id}')
-                continue
-            # Skip internal requests. The internal requests are scheduled with
-            # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
-            if request_record.request_id in set(
-                    event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
-                continue
-            if request_record.status > RequestStatus.RUNNING:
-                logger.debug(f'Request {request_id} already finished')
-                continue
-            if request_record.pid is not None:
-                logger.debug(f'Killing request process {request_record.pid}')
-                # Use SIGTERM instead of SIGKILL:
-                # - The executor can handle SIGTERM gracefully
-                # - After SIGTERM, the executor can reuse the request process
-                #   for other requests, avoiding the overhead of forking a new
-                #   process for each request.
-                os.kill(request_record.pid, signal.SIGTERM)
-            request_record.status = RequestStatus.CANCELLED
-            request_record.finished_at = time.time()
-            cancelled_request_ids.append(request_id)
-    return cancelled_request_ids
-
-
 def create_table(cursor, conn):
     # Enable WAL mode to avoid locking issues.
     # See: issue #1441 and PR #1509
@@ -624,6 +537,128 @@ def request_lock_path(request_id: str) -> str:
     return os.path.join(lock_path, f'.{request_id}.lock')


+def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
+    """Kill all pending and running requests for a cluster.
+
+    Args:
+        cluster_name: the name of the cluster.
+        exclude_request_names: exclude requests with these names. This is to
+            prevent killing the caller request.
+    """
+    request_ids = [
+        request_task.request_id
+        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+            exclude_request_names=[exclude_request_name],
+            cluster_names=[cluster_name],
+            fields=['request_id']))
+    ]
+    _kill_requests(request_ids)
+
+
+def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
+                              user_id: Optional[str] = None) -> List[str]:
+    """Kill requests with a given request ID prefix."""
+    expanded_request_ids: Optional[List[str]] = None
+    if request_ids is not None:
+        expanded_request_ids = []
+        for request_id in request_ids:
+            request_tasks = get_requests_with_prefix(request_id,
+                                                     fields=['request_id'])
+            if request_tasks is None or len(request_tasks) == 0:
+                continue
+            if len(request_tasks) > 1:
+                raise ValueError(f'Multiple requests found for '
+                                 f'request ID prefix: {request_id}')
+            expanded_request_ids.append(request_tasks[0].request_id)
+    return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
+
+
+def _should_kill_request(request_id: str,
+                         request_record: Optional[Request]) -> bool:
+    if request_record is None:
+        logger.debug(f'No request ID {request_id}')
+        return False
+    # Skip internal requests. The internal requests are scheduled with
+    # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
+    if request_record.request_id in set(
+            event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
+        return False
+    if request_record.status > RequestStatus.RUNNING:
+        logger.debug(f'Request {request_id} already finished')
+        return False
+    return True
+
+
+def _kill_requests(request_ids: Optional[List[str]] = None,
+                   user_id: Optional[str] = None) -> List[str]:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Args:
+        request_ids: The request IDs to kill. If None, all requests for the
+            user are killed.
+        user_id: The user ID to kill requests for. If None, all users are
+            killed.
+
+    Returns:
+        A list of request IDs that were cancelled.
+    """
+    if request_ids is None:
+        request_ids = [
+            request_task.request_id
+            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+                # Avoid cancelling the cancel request itself.
+                exclude_request_names=['sky.api_cancel'],
+                user_id=user_id,
+                fields=['request_id']))
+        ]
+    cancelled_request_ids = []
+    for request_id in request_ids:
+        with update_request(request_id) as request_record:
+            if not _should_kill_request(request_id, request_record):
+                continue
+            if request_record.pid is not None:
+                logger.debug(f'Killing request process {request_record.pid}')
+                # Use SIGTERM instead of SIGKILL:
+                # - The executor can handle SIGTERM gracefully
+                # - After SIGTERM, the executor can reuse the request process
+                #   for other requests, avoiding the overhead of forking a new
+                #   process for each request.
+                os.kill(request_record.pid, signal.SIGTERM)
+            request_record.status = RequestStatus.CANCELLED
+            request_record.finished_at = time.time()
+            cancelled_request_ids.append(request_id)
+    return cancelled_request_ids
+
+
+@init_db_async
+@asyncio_utils.shield
+async def kill_request_async(request_id: str) -> bool:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Returns:
+        True if the request was killed, False otherwise.
+    """
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request = await _get_request_no_lock_async(request_id)
+        if not _should_kill_request(request_id, request):
+            return False
+        assert request is not None
+        if request.pid is not None:
+            logger.debug(f'Killing request process {request.pid}')
+            # Use SIGTERM instead of SIGKILL:
+            # - The executor can handle SIGTERM gracefully
+            # - After SIGTERM, the executor can reuse the request process
+            #   for other requests, avoiding the overhead of forking a new
+            #   process for each request.
+            os.kill(request.pid, signal.SIGTERM)
+        request.status = RequestStatus.CANCELLED
+        request.finished_at = time.time()
+        await _add_or_update_request_no_lock_async(request)
+        return True
+
+
 @contextlib.contextmanager
 @init_db
 @metrics_lib.time_me
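The new kill_request_async reuses the _should_kill_request check from the sync path but takes the per-request lock with filelock's async API, so it can run directly on the server's event loop without blocking it. A minimal sketch of that locking pattern (the lock path and critical section are placeholders):

import asyncio

import filelock


async def cancel_with_lock(lock_path: str) -> None:
    # AsyncFileLock awaits lock acquisition instead of blocking the
    # event loop the way the synchronous FileLock would.
    async with filelock.AsyncFileLock(lock_path):
        ...  # Load the request record, SIGTERM its pid, mark CANCELLED.


asyncio.run(cancel_with_lock('/tmp/.demo-request.lock'))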
@@ -638,7 +673,7 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
        _add_or_update_request_no_lock(request)


-@init_db
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_async(request_id: str, status: RequestStatus) -> None:
@@ -650,7 +685,7 @@ async def update_status_async(request_id: str, status: RequestStatus) -> None:
    await _add_or_update_request_no_lock_async(request)


-@init_db
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_msg_async(request_id: str, status_msg: str) -> None:
sky/server/requests/serializers/decoders.py CHANGED
@@ -60,12 +60,6 @@ def decode_status(
        if 'handle' in cluster and cluster['handle'] is not None:
            cluster['handle'] = decode_and_unpickle(cluster['handle'])
        cluster['status'] = status_lib.ClusterStatus(cluster['status'])
-        # this field is to be deprecated in the future.
-        # do not decode this field if it is not present.
-        if ('storage_mounts_metadata' in cluster and
-                cluster['storage_mounts_metadata'] is not None):
-            cluster['storage_mounts_metadata'] = decode_and_unpickle(
-                cluster['storage_mounts_metadata'])
        if 'is_managed' not in cluster:
            cluster['is_managed'] = False
        response.append(responses.StatusResponse.model_validate(cluster))
sky/server/requests/serializers/encoders.py CHANGED
@@ -60,13 +60,23 @@ def encode_status(
        clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
    response = []
    for cluster in clusters:
-        response_cluster = cluster.model_dump()
+        response_cluster = cluster.model_dump(exclude_none=True)
+        # These default setting is needed because last_use and status_updated_at
+        # used to be not optional.
+        # TODO(syang): remove this after v0.10.7 or v0.11.0
+        if 'last_use' not in response_cluster:
+            response_cluster['last_use'] = ''
+        if 'status_updated_at' not in response_cluster:
+            response_cluster['status_updated_at'] = 0
        response_cluster['status'] = cluster['status'].value
        handle = serialize_utils.prepare_handle_for_backwards_compatibility(
            cluster['handle'])
        response_cluster['handle'] = pickle_and_encode(handle)
+        # TODO (syang) We still need to return this field for backwards
+        # compatibility.
+        # Remove this field at or after v0.10.7 or v0.11.0
        response_cluster['storage_mounts_metadata'] = pickle_and_encode(
-            response_cluster['storage_mounts_metadata'])
+            None)  # Always returns None.
        response.append(response_cluster)
    return response
@@ -206,10 +216,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
 @register_encoder('storage_ls')
 def encode_storage_ls(
        return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
-    for storage_info in return_value:
+    response_list = [storage_info.model_dump() for storage_info in return_value]
+    for storage_info in response_list:
        storage_info['status'] = storage_info['status'].value
        storage_info['store'] = [store.value for store in storage_info['store']]
-    return [storage_info.model_dump() for storage_info in return_value]
+    return response_list


 @register_encoder('volume_list')
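The encode_storage_ls rewrite dumps each record to a plain dict before mutating it. That ordering matters with pydantic models, which do not support item assignment; a minimal sketch (Record and Status are illustrative stand-ins, not the actual responses.StorageRecord):

import enum

import pydantic


class Status(enum.Enum):
    READY = 'READY'


class Record(pydantic.BaseModel):
    status: Status


record = Record(status=Status.READY)
try:
    record['status'] = 'READY'  # type: ignore[index]
except TypeError:
    # BaseModel has no __setitem__; mutate the dumped dict instead,
    # as the rewritten encoder does.
    dumped = record.model_dump()
    dumped['status'] = dumped['status'].value
    print(dumped)  # {'status': 'READY'}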
@@ -219,11 +230,11 @@ def encode_volume_list(


 @register_encoder('job_status')
-def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
+def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
    for job_id in return_value.keys():
        if return_value[job_id] is not None:
            return_value[job_id] = return_value[job_id].value
-    return return_value
+    return {str(k): v for k, v in return_value.items()}


 @register_encoder('kubernetes_node_info')
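encode_job_status now stringifies its integer job-id keys, matching encode_endpoints below; worth noting alongside the orjson adoption elsewhere in this release, since JSON object keys are always strings and orjson rejects non-str dict keys by default where stdlib json silently coerces them:

import json

import orjson

# stdlib json coerces int keys to strings on encode...
assert json.loads(json.dumps({1: 'RUNNING'})) == {'1': 'RUNNING'}
# ...but orjson raises unless OPT_NON_STR_KEYS is passed.
try:
    orjson.dumps({1: 'RUNNING'})
except TypeError:
    pass
assert orjson.dumps({'1': 'RUNNING'}) == b'{"1":"RUNNING"}'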
@@ -235,3 +246,19 @@ def encode_kubernetes_node_info(
 @register_encoder('endpoints')
 def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
    return {str(k): v for k, v in return_value.items()}
+
+
+@register_encoder('realtime_kubernetes_gpu_availability')
+def encode_realtime_gpu_availability(
+    return_value: List[Tuple[str,
+                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
+    # Convert RealtimeGpuAvailability namedtuples to lists
+    # for JSON serialization.
+    result = []
+    for context, gpu_list in return_value:
+        gpu_availability_list = []
+        for gpu in gpu_list:
+            gpu_list_item = [gpu.gpu, gpu.counts, gpu.capacity, gpu.available]
+            gpu_availability_list.append(gpu_list_item)
+        result.append((context, gpu_availability_list))
+    return result
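The new encoder flattens each RealtimeGpuAvailability namedtuple into plain lists so the wire format does not depend on the Python-side tuple definition. A small illustration (the namedtuple is a stand-in with the field names used above; the values are made up):

import collections
import json

Availability = collections.namedtuple(
    'Availability', ['gpu', 'counts', 'capacity', 'available'])

gpu = Availability(gpu='H100', counts=[1, 2], capacity=8, available=8)
as_list = [gpu.gpu, gpu.counts, gpu.capacity, gpu.available]
# A (context, [[...], ...]) structure serializes as nested JSON arrays.
print(json.dumps([('my-context', [as_list])]))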
sky/server/server.py CHANGED
@@ -25,6 +25,7 @@ import zipfile
 import aiofiles
 import anyio
 import fastapi
+from fastapi import responses as fastapi_responses
 from fastapi.middleware import cors
 import starlette.middleware.base
 import uvloop
@@ -1512,7 +1513,7 @@ async def get_expanded_request_id(request_id: str) -> str:


 # === API server related APIs ===
-@app.get('/api/get')
+@app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
 async def api_get(request_id: str) -> payloads.RequestPayload:
    """Gets a request with a given request ID prefix."""
    # Validate request_id prefix matches a single request.
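ORJSONResponse is FastAPI's bundled orjson-backed response class (it requires orjson, which this release adds as a dependency). A minimal sketch of the same wiring with an illustrative endpoint:

import fastapi
from fastapi import responses as fastapi_responses

app = fastapi.FastAPI()


# response_class swaps the default JSONResponse serializer for orjson,
# which helps for large payloads such as request records.
@app.get('/demo/get', response_class=fastapi_responses.ORJSONResponse)
async def demo_get() -> dict:
    return {'request_id': 'abc123', 'status': 'SUCCEEDED'}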
sky/server/stream_utils.py CHANGED
@@ -25,6 +25,8 @@ logger = sky_logging.init_logger(__name__)
 _BUFFER_SIZE = 8 * 1024  # 8KB
 _BUFFER_TIMEOUT = 0.02  # 20ms
 _HEARTBEAT_INTERVAL = 30
+_READ_CHUNK_SIZE = 256 * 1024  # 256KB chunks for file reading
+
 # If a SHORT request has been stuck in pending for
 # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
 _SHORT_REQUEST_SPINNER_TIMEOUT = 2
@@ -235,6 +237,9 @@ async def _tail_log_file(
    buffer_bytes = 0
    last_flush_time = asyncio.get_event_loop().time()

+    # Read file in chunks instead of line-by-line for better performance
+    incomplete_line = b''  # Buffer for incomplete lines across chunks
+
    async def flush_buffer() -> AsyncGenerator[str, None]:
        nonlocal buffer, buffer_bytes, last_flush_time
        if buffer:
@@ -255,8 +260,23 @@ async def _tail_log_file(
                async for chunk in flush_buffer():
                    yield chunk

-            line: Optional[bytes] = await f.readline()
-            if not line:
+            # Read file in chunks for better I/O performance
+            file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
+            if not file_chunk:
+                # Process any remaining incomplete line
+                if incomplete_line:
+                    line_str = incomplete_line.decode('utf-8')
+                    if plain_logs:
+                        is_payload, line_str = message_utils.decode_payload(
+                            line_str, raise_for_mismatch=False)
+                        if not is_payload:
+                            buffer.append(line_str)
+                            buffer_bytes += len(line_str.encode('utf-8'))
+                    else:
+                        buffer.append(line_str)
+                        buffer_bytes += len(line_str.encode('utf-8'))
+                    incomplete_line = b''
+
                # Avoid checking the status too frequently to avoid overloading the
                # DB.
                should_check_status = (current_time -
@@ -328,16 +348,39 @@ async def _tail_log_file(
                # performance but it helps avoid unnecessary heartbeat strings
                # being printed when the client runs in an old version.
                last_heartbeat_time = asyncio.get_event_loop().time()
-            line_str = line.decode('utf-8')
-            if plain_logs:
-                is_payload, line_str = message_utils.decode_payload(
-                    line_str, raise_for_mismatch=False)
-                # TODO(aylei): implement heartbeat mechanism for plain logs,
-                # sending invisible characters might be okay.
-                if is_payload:
-                    continue
-            buffer.append(line_str)
-            buffer_bytes += len(line_str.encode('utf-8'))
+
+            # Combine with any incomplete line from previous chunk
+            file_chunk = incomplete_line + file_chunk
+            incomplete_line = b''
+
+            # Split chunk into lines, preserving line structure
+            lines_bytes = file_chunk.split(b'\n')
+
+            # If chunk doesn't end with newline, the last element is incomplete
+            if file_chunk and not file_chunk.endswith(b'\n'):
+                incomplete_line = lines_bytes[-1]
+                lines_bytes = lines_bytes[:-1]
+            else:
+                # If ends with \n, split creates an empty last element we should
+                # ignore
+                if lines_bytes and lines_bytes[-1] == b'':
+                    lines_bytes = lines_bytes[:-1]
+
+            # Process all complete lines in this chunk
+            for line_bytes in lines_bytes:
+                # Reconstruct line with newline (since split removed it)
+                line_str = line_bytes.decode('utf-8') + '\n'
+
+                if plain_logs:
+                    is_payload, line_str = message_utils.decode_payload(
+                        line_str, raise_for_mismatch=False)
+                    # TODO(aylei): implement heartbeat mechanism for plain logs,
+                    # sending invisible characters might be okay.
+                    if is_payload:
+                        continue
+
+                buffer.append(line_str)
+                buffer_bytes += len(line_str.encode('utf-8'))

    # Flush remaining lines in the buffer.
    async for chunk in flush_buffer():
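The carry-over logic above can be restated in isolation; a minimal sketch (hypothetical helper name) of how complete lines are split off while a partial tail is carried to the next read:

from typing import List, Tuple


def split_lines(carry: bytes, chunk: bytes) -> Tuple[List[bytes], bytes]:
    """Split carry + chunk into complete lines plus an incomplete tail."""
    data = carry + chunk
    lines = data.split(b'\n')
    if data and not data.endswith(b'\n'):
        # The last element is a partial line; carry it to the next read.
        return lines[:-1], lines[-1]
    if lines and lines[-1] == b'':
        lines = lines[:-1]  # Drop the empty element after a final newline.
    return lines, b''


assert split_lines(b'', b'a\nb\nc') == ([b'a', b'b'], b'c')
assert split_lines(b'c', b'd\n') == ([b'cd'], b'')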
@@ -373,7 +416,7 @@ def stream_response(
    async def on_disconnect():
        logger.info(f'User terminated the connection for request '
                    f'{request_id}')
-        requests_lib.kill_requests([request_id])
+        await requests_lib.kill_request_async(request_id)

    # The background task will be run after returning a response.
    # https://fastapi.tiangolo.com/tutorial/background-tasks/
sky/setup_files/dependencies.py CHANGED
@@ -49,6 +49,7 @@ install_requires = [
    # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
    'pyyaml > 3.13, != 5.4.*',
    'ijson',
+    'orjson',
    'requests',
    # SkyPilot inherits from uvicorn.Server to customize the behavior of
    # uvicorn, so we need to pin uvicorn version to avoid potential break
@@ -187,6 +188,7 @@ cloud_dependencies: Dict[str, List[str]] = {
    'docker': ['docker'] + local_ray,
    'lambda': [],  # No dependencies needed for lambda
    'cloudflare': aws_dependencies,
+    'coreweave': aws_dependencies,
    'scp': local_ray,
    'oci': ['oci'],
    # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333  # pylint: disable=line-too-long
sky/task.py CHANGED
@@ -1552,6 +1552,16 @@ class Task:
                self.update_file_mounts({
                    mnt_path: blob_path,
                })
+            elif store_type is storage_lib.StoreType.COREWEAVE:
+                if storage.source is not None and not isinstance(
+                        storage.source,
+                        list) and storage.source.startswith('cw://'):
+                    blob_path = storage.source
+                else:
+                    blob_path = 'cw://' + storage.name
+                self.update_file_mounts({
+                    mnt_path: blob_path,
+                })
            else:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(f'Storage Type {store_type} '
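The new COREWEAVE branch mirrors the existing store branches: reuse the source when it is already a cw:// URI, otherwise derive one from the storage name. Restated as a standalone helper (hypothetical name) with the same logic:

from typing import List, Optional, Union


def coreweave_blob_path(source: Optional[Union[str, List[str]]],
                        name: str) -> str:
    if (source is not None and not isinstance(source, list) and
            source.startswith('cw://')):
        return source  # Already a CoreWeave URI; mount it as-is.
    return 'cw://' + name


assert coreweave_blob_path('cw://bucket/path', 'unused') == 'cw://bucket/path'
assert coreweave_blob_path(None, 'my-bucket') == 'cw://my-bucket'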
sky/templates/nebius-ray.yml.j2 CHANGED
@@ -156,6 +156,7 @@ setup_commands:
      echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
    {%- endfor %}
  {%- endif %}
+  IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
sky/utils/cli_utils/status_utils.py CHANGED
@@ -282,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
        if resources_str_full is not None:
            resources_str = resources_str_full
        if resources_str is None:
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=truncate)
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=truncate))
+            if truncate:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full

        return resources_str
    return '-'