skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()



-
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n*sky/schemas/generated/managed_jobsv1.proto\x12\x0fmanaged_jobs.v1\"\x15\n\x06JobIds\x12\x0b\n\x03ids\x18\x01 \x03(\x03\"\x1c\n\nUserHashes\x12\x0e\n\x06hashes\x18\x01 \x03(\t\"\x1c\n\x08Statuses\x12\x10\n\x08statuses\x18\x01 \x03(\t\"\x18\n\x06\x46ields\x12\x0e\n\x06\x66ields\x18\x01 \x03(\t\" \n\nWorkspaces\x12\x12\n\nworkspaces\x18\x01 \x03(\t\"\x13\n\x11GetVersionRequest\"0\n\x12GetVersionResponse\x12\x1a\n\x12\x63ontroller_version\x18\x01 \x01(\t\"\xe1\x04\n\x12GetJobTableRequest\x12\x15\n\rskip_finished\x18\x01 \x01(\x08\x12?\n\x15\x61\x63\x63\x65ssible_workspaces\x18\x02 \x01(\x0b\x32\x1b.managed_jobs.v1.WorkspacesH\x00\x88\x01\x01\x12-\n\x07job_ids\x18\x03 \x01(\x0b\x32\x17.managed_jobs.v1.JobIdsH\x01\x88\x01\x01\x12\x1c\n\x0fworkspace_match\x18\x04 \x01(\tH\x02\x88\x01\x01\x12\x17\n\nname_match\x18\x05 \x01(\tH\x03\x88\x01\x01\x12\x17\n\npool_match\x18\x06 \x01(\tH\x04\x88\x01\x01\x12\x11\n\x04page\x18\x07 \x01(\x05H\x05\x88\x01\x01\x12\x12\n\x05limit\x18\x08 \x01(\x05H\x06\x88\x01\x01\x12\x35\n\x0buser_hashes\x18\t \x01(\x0b\x32\x1b.managed_jobs.v1.UserHashesH\x07\x88\x01\x01\x12\x30\n\x08statuses\x18\n \x01(\x0b\x32\x19.managed_jobs.v1.StatusesH\x08\x88\x01\x01\x12#\n\x1bshow_jobs_without_user_hash\x18\x0b \x01(\x08\x12,\n\x06\x66ields\x18\x0c \x01(\x0b\x32\x17.managed_jobs.v1.FieldsH\t\x88\x01\x01\x42\x18\n\x16_accessible_workspacesB\n\n\x08_job_idsB\x12\n\x10_workspace_matchB\r\n\x0b_name_matchB\r\n\x0b_pool_matchB\x07\n\x05_pageB\x08\n\x06_limitB\x0e\n\x0c_user_hashesB\x0b\n\t_statusesB\t\n\x07_fields\"\
+
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n*sky/schemas/generated/managed_jobsv1.proto\x12\x0fmanaged_jobs.v1\"\x15\n\x06JobIds\x12\x0b\n\x03ids\x18\x01 \x03(\x03\"\x1c\n\nUserHashes\x12\x0e\n\x06hashes\x18\x01 \x03(\t\"\x1c\n\x08Statuses\x12\x10\n\x08statuses\x18\x01 \x03(\t\"\x18\n\x06\x46ields\x12\x0e\n\x06\x66ields\x18\x01 \x03(\t\" \n\nWorkspaces\x12\x12\n\nworkspaces\x18\x01 \x03(\t\"\x13\n\x11GetVersionRequest\"0\n\x12GetVersionResponse\x12\x1a\n\x12\x63ontroller_version\x18\x01 \x01(\t\"\xe1\x04\n\x12GetJobTableRequest\x12\x15\n\rskip_finished\x18\x01 \x01(\x08\x12?\n\x15\x61\x63\x63\x65ssible_workspaces\x18\x02 \x01(\x0b\x32\x1b.managed_jobs.v1.WorkspacesH\x00\x88\x01\x01\x12-\n\x07job_ids\x18\x03 \x01(\x0b\x32\x17.managed_jobs.v1.JobIdsH\x01\x88\x01\x01\x12\x1c\n\x0fworkspace_match\x18\x04 \x01(\tH\x02\x88\x01\x01\x12\x17\n\nname_match\x18\x05 \x01(\tH\x03\x88\x01\x01\x12\x17\n\npool_match\x18\x06 \x01(\tH\x04\x88\x01\x01\x12\x11\n\x04page\x18\x07 \x01(\x05H\x05\x88\x01\x01\x12\x12\n\x05limit\x18\x08 \x01(\x05H\x06\x88\x01\x01\x12\x35\n\x0buser_hashes\x18\t \x01(\x0b\x32\x1b.managed_jobs.v1.UserHashesH\x07\x88\x01\x01\x12\x30\n\x08statuses\x18\n \x01(\x0b\x32\x19.managed_jobs.v1.StatusesH\x08\x88\x01\x01\x12#\n\x1bshow_jobs_without_user_hash\x18\x0b \x01(\x08\x12,\n\x06\x66ields\x18\x0c \x01(\x0b\x32\x17.managed_jobs.v1.FieldsH\t\x88\x01\x01\x42\x18\n\x16_accessible_workspacesB\n\n\x08_job_idsB\x12\n\x10_workspace_matchB\r\n\x0b_name_matchB\r\n\x0b_pool_matchB\x07\n\x05_pageB\x08\n\x06_limitB\x0e\n\x0c_user_hashesB\x0b\n\t_statusesB\t\n\x07_fields\"\xb4\t\n\x0eManagedJobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07task_id\x18\x02 \x01(\x03\x12\x10\n\x08job_name\x18\x03 \x01(\t\x12\x11\n\ttask_name\x18\x04 \x01(\t\x12\x14\n\x0cjob_duration\x18\x05 \x01(\x01\x12\x16\n\tworkspace\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x31\n\x06status\x18\x07 \x01(\x0e\x32!.managed_jobs.v1.ManagedJobStatus\x12@\n\x0eschedule_state\x18\x08 \x01(\x0e\x32(.managed_jobs.v1.ManagedJobScheduleState\x12\x11\n\tresources\x18\t \x01(\t\x12\x19\n\x11\x63luster_resources\x18\n \x01(\t\x12\x1e\n\x16\x63luster_resources_full\x18\x0b \x01(\t\x12\r\n\x05\x63loud\x18\x0c \x01(\t\x12\x0e\n\x06region\x18\r \x01(\t\x12\r\n\x05infra\x18\x0e \x01(\t\x12G\n\x0c\x61\x63\x63\x65lerators\x18\x0f \x03(\x0b\x32\x31.managed_jobs.v1.ManagedJobInfo.AcceleratorsEntry\x12\x16\n\x0erecovery_count\x18\x10 \x01(\x05\x12\x14\n\x07\x64\x65tails\x18\x11 \x01(\tH\x01\x88\x01\x01\x12\x1b\n\x0e\x66\x61ilure_reason\x18\x12 \x01(\tH\x02\x88\x01\x01\x12\x16\n\tuser_name\x18\x13 \x01(\tH\x03\x88\x01\x01\x12\x16\n\tuser_hash\x18\x14 \x01(\tH\x04\x88\x01\x01\x12\x19\n\x0csubmitted_at\x18\x15 \x01(\x01H\x05\x88\x01\x01\x12\x15\n\x08start_at\x18\x16 \x01(\x01H\x06\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x17 \x01(\x01H\x07\x88\x01\x01\x12\x16\n\tuser_yaml\x18\x18 \x01(\tH\x08\x88\x01\x01\x12\x17\n\nentrypoint\x18\x19 \x01(\tH\t\x88\x01\x01\x12?\n\x08metadata\x18\x1a \x03(\x0b\x32-.managed_jobs.v1.ManagedJobInfo.MetadataEntry\x12\x11\n\x04pool\x18\x1b \x01(\tH\n\x88\x01\x01\x12\x16\n\tpool_hash\x18\x1c \x01(\tH\x0b\x88\x01\x01\x12\x14\n\x07_job_id\x18\x1d \x01(\x03H\x0c\x88\x01\x01\x12\x39\n\x05links\x18\x1e \x03(\x0b\x32*.managed_jobs.v1.ManagedJobInfo.LinksEntry\x1a\x33\n\x11\x41\x63\x63\x65leratorsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x1a/\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 
\x01(\t:\x02\x38\x01\x1a,\n\nLinksEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\n_workspaceB\n\n\x08_detailsB\x11\n\x0f_failure_reasonB\x0c\n\n_user_nameB\x0c\n\n_user_hashB\x0f\n\r_submitted_atB\x0b\n\t_start_atB\t\n\x07_end_atB\x0c\n\n_user_yamlB\r\n\x0b_entrypointB\x07\n\x05_poolB\x0c\n\n_pool_hashB\n\n\x08X_job_id\"\xf0\x01\n\x13GetJobTableResponse\x12-\n\x04jobs\x18\x01 \x03(\x0b\x32\x1f.managed_jobs.v1.ManagedJobInfo\x12\r\n\x05total\x18\x02 \x01(\x05\x12\x17\n\x0ftotal_no_filter\x18\x03 \x01(\x05\x12M\n\rstatus_counts\x18\x04 \x03(\x0b\x32\x36.managed_jobs.v1.GetJobTableResponse.StatusCountsEntry\x1a\x33\n\x11StatusCountsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x05:\x02\x38\x01\"?\n\x19GetAllJobIdsByNameRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x0b\n\t_job_name\"-\n\x1aGetAllJobIdsByNameResponse\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xd7\x01\n\x11\x43\x61ncelJobsRequest\x12\x19\n\x11\x63urrent_workspace\x18\x01 \x01(\t\x12\x16\n\tuser_hash\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x13\n\tall_users\x18\x03 \x01(\x08H\x00\x12*\n\x07job_ids\x18\x04 \x01(\x0b\x32\x17.managed_jobs.v1.JobIdsH\x00\x12\x12\n\x08job_name\x18\x05 \x01(\tH\x00\x12\x13\n\tpool_name\x18\x06 \x01(\tH\x00\x42\x17\n\x15\x63\x61ncellation_criteriaB\x0c\n\n_user_hash\"%\n\x12\x43\x61ncelJobsResponse\x12\x0f\n\x07message\x18\x01 \x01(\t\"\x97\x01\n\x11StreamLogsRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x06job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x12\n\ncontroller\x18\x04 \x01(\x08\x12\x11\n\x04tail\x18\x05 \x01(\x05H\x02\x88\x01\x01\x42\x0b\n\t_job_nameB\t\n\x07_job_idB\x07\n\x05_tail\"L\n\x12StreamLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x16\n\texit_code\x18\x02 \x01(\x05H\x00\x88\x01\x01\x42\x0c\n\n_exit_code*\x85\x04\n\x10ManagedJobStatus\x12\"\n\x1eMANAGED_JOB_STATUS_UNSPECIFIED\x10\x00\x12\x1e\n\x1aMANAGED_JOB_STATUS_PENDING\x10\x01\x12 \n\x1cMANAGED_JOB_STATUS_SUBMITTED\x10\x02\x12\x1f\n\x1bMANAGED_JOB_STATUS_STARTING\x10\x03\x12\x1e\n\x1aMANAGED_JOB_STATUS_RUNNING\x10\x04\x12!\n\x1dMANAGED_JOB_STATUS_RECOVERING\x10\x05\x12!\n\x1dMANAGED_JOB_STATUS_CANCELLING\x10\x06\x12 \n\x1cMANAGED_JOB_STATUS_SUCCEEDED\x10\x07\x12 \n\x1cMANAGED_JOB_STATUS_CANCELLED\x10\x08\x12\x1d\n\x19MANAGED_JOB_STATUS_FAILED\x10\t\x12#\n\x1fMANAGED_JOB_STATUS_FAILED_SETUP\x10\n\x12\'\n#MANAGED_JOB_STATUS_FAILED_PRECHECKS\x10\x0b\x12)\n%MANAGED_JOB_STATUS_FAILED_NO_RESOURCE\x10\x0c\x12(\n$MANAGED_JOB_STATUS_FAILED_CONTROLLER\x10\r*\x9e\x03\n\x17ManagedJobScheduleState\x12*\n&MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED\x10\x00\x12\x35\n-DEPRECATED_MANAGED_JOB_SCHEDULE_STATE_INVALID\x10\x01\x1a\x02\x08\x01\x12\'\n#MANAGED_JOB_SCHEDULE_STATE_INACTIVE\x10\x02\x12&\n\"MANAGED_JOB_SCHEDULE_STATE_WAITING\x10\x03\x12,\n(MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING\x10\x04\x12(\n$MANAGED_JOB_SCHEDULE_STATE_LAUNCHING\x10\x05\x12,\n(MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF\x10\x06\x12$\n 
MANAGED_JOB_SCHEDULE_STATE_ALIVE\x10\x07\x12#\n\x1fMANAGED_JOB_SCHEDULE_STATE_DONE\x10\x08\x32\xe4\x03\n\x12ManagedJobsService\x12U\n\nGetVersion\x12\".managed_jobs.v1.GetVersionRequest\x1a#.managed_jobs.v1.GetVersionResponse\x12X\n\x0bGetJobTable\x12#.managed_jobs.v1.GetJobTableRequest\x1a$.managed_jobs.v1.GetJobTableResponse\x12m\n\x12GetAllJobIdsByName\x12*.managed_jobs.v1.GetAllJobIdsByNameRequest\x1a+.managed_jobs.v1.GetAllJobIdsByNameResponse\x12U\n\nCancelJobs\x12\".managed_jobs.v1.CancelJobsRequest\x1a#.managed_jobs.v1.CancelJobsResponse\x12W\n\nStreamLogs\x12\".managed_jobs.v1.StreamLogsRequest\x1a#.managed_jobs.v1.StreamLogsResponse0\x01\x62\x06proto3')

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -27,12 +27,14 @@ if not _descriptor._USE_C_DESCRIPTORS:
  _globals['_MANAGEDJOBINFO_ACCELERATORSENTRY']._serialized_options = b'8\001'
  _globals['_MANAGEDJOBINFO_METADATAENTRY']._loaded_options = None
  _globals['_MANAGEDJOBINFO_METADATAENTRY']._serialized_options = b'8\001'
+  _globals['_MANAGEDJOBINFO_LINKSENTRY']._loaded_options = None
+  _globals['_MANAGEDJOBINFO_LINKSENTRY']._serialized_options = b'8\001'
  _globals['_GETJOBTABLERESPONSE_STATUSCOUNTSENTRY']._loaded_options = None
  _globals['_GETJOBTABLERESPONSE_STATUSCOUNTSENTRY']._serialized_options = b'8\001'
-  _globals['_MANAGEDJOBSTATUS']._serialized_start=
-  _globals['_MANAGEDJOBSTATUS']._serialized_end=
-  _globals['_MANAGEDJOBSCHEDULESTATE']._serialized_start=
-  _globals['_MANAGEDJOBSCHEDULESTATE']._serialized_end=
+  _globals['_MANAGEDJOBSTATUS']._serialized_start=2941
+  _globals['_MANAGEDJOBSTATUS']._serialized_end=3458
+  _globals['_MANAGEDJOBSCHEDULESTATE']._serialized_start=3461
+  _globals['_MANAGEDJOBSCHEDULESTATE']._serialized_end=3875
  _globals['_JOBIDS']._serialized_start=63
  _globals['_JOBIDS']._serialized_end=84
  _globals['_USERHASHES']._serialized_start=86
@@ -50,27 +52,29 @@ if not _descriptor._USE_C_DESCRIPTORS:
  _globals['_GETJOBTABLEREQUEST']._serialized_start=278
  _globals['_GETJOBTABLEREQUEST']._serialized_end=887
  _globals['_MANAGEDJOBINFO']._serialized_start=890
-  _globals['_MANAGEDJOBINFO']._serialized_end=
-  _globals['_MANAGEDJOBINFO_ACCELERATORSENTRY']._serialized_start=
-  _globals['_MANAGEDJOBINFO_ACCELERATORSENTRY']._serialized_end=
-  _globals['_MANAGEDJOBINFO_METADATAENTRY']._serialized_start=
-  _globals['_MANAGEDJOBINFO_METADATAENTRY']._serialized_end=
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
+  _globals['_MANAGEDJOBINFO']._serialized_end=2094
+  _globals['_MANAGEDJOBINFO_ACCELERATORSENTRY']._serialized_start=1770
+  _globals['_MANAGEDJOBINFO_ACCELERATORSENTRY']._serialized_end=1821
+  _globals['_MANAGEDJOBINFO_METADATAENTRY']._serialized_start=1823
+  _globals['_MANAGEDJOBINFO_METADATAENTRY']._serialized_end=1870
+  _globals['_MANAGEDJOBINFO_LINKSENTRY']._serialized_start=1872
+  _globals['_MANAGEDJOBINFO_LINKSENTRY']._serialized_end=1916
+  _globals['_GETJOBTABLERESPONSE']._serialized_start=2097
+  _globals['_GETJOBTABLERESPONSE']._serialized_end=2337
+  _globals['_GETJOBTABLERESPONSE_STATUSCOUNTSENTRY']._serialized_start=2286
+  _globals['_GETJOBTABLERESPONSE_STATUSCOUNTSENTRY']._serialized_end=2337
+  _globals['_GETALLJOBIDSBYNAMEREQUEST']._serialized_start=2339
+  _globals['_GETALLJOBIDSBYNAMEREQUEST']._serialized_end=2402
+  _globals['_GETALLJOBIDSBYNAMERESPONSE']._serialized_start=2404
+  _globals['_GETALLJOBIDSBYNAMERESPONSE']._serialized_end=2449
+  _globals['_CANCELJOBSREQUEST']._serialized_start=2452
+  _globals['_CANCELJOBSREQUEST']._serialized_end=2667
+  _globals['_CANCELJOBSRESPONSE']._serialized_start=2669
+  _globals['_CANCELJOBSRESPONSE']._serialized_end=2706
+  _globals['_STREAMLOGSREQUEST']._serialized_start=2709
+  _globals['_STREAMLOGSREQUEST']._serialized_end=2860
+  _globals['_STREAMLOGSRESPONSE']._serialized_start=2862
+  _globals['_STREAMLOGSRESPONSE']._serialized_end=2938
+  _globals['_MANAGEDJOBSSERVICE']._serialized_start=3878
+  _globals['_MANAGEDJOBSSERVICE']._serialized_end=4362
# @@protoc_insertion_point(module_scope)
sky/schemas/generated/managed_jobsv1_pb2.pyi
CHANGED
@@ -127,7 +127,7 @@ class GetJobTableRequest(_message.Message):
def __init__(self, skip_finished: bool = ..., accessible_workspaces: _Optional[_Union[Workspaces, _Mapping]] = ..., job_ids: _Optional[_Union[JobIds, _Mapping]] = ..., workspace_match: _Optional[str] = ..., name_match: _Optional[str] = ..., pool_match: _Optional[str] = ..., page: _Optional[int] = ..., limit: _Optional[int] = ..., user_hashes: _Optional[_Union[UserHashes, _Mapping]] = ..., statuses: _Optional[_Union[Statuses, _Mapping]] = ..., show_jobs_without_user_hash: bool = ..., fields: _Optional[_Union[Fields, _Mapping]] = ...) -> None: ...

class ManagedJobInfo(_message.Message):
-    __slots__ = ("job_id", "task_id", "job_name", "task_name", "job_duration", "workspace", "status", "schedule_state", "resources", "cluster_resources", "cluster_resources_full", "cloud", "region", "infra", "accelerators", "recovery_count", "details", "failure_reason", "user_name", "user_hash", "submitted_at", "start_at", "end_at", "user_yaml", "entrypoint", "metadata", "pool", "pool_hash", "_job_id")
+    __slots__ = ("job_id", "task_id", "job_name", "task_name", "job_duration", "workspace", "status", "schedule_state", "resources", "cluster_resources", "cluster_resources_full", "cloud", "region", "infra", "accelerators", "recovery_count", "details", "failure_reason", "user_name", "user_hash", "submitted_at", "start_at", "end_at", "user_yaml", "entrypoint", "metadata", "pool", "pool_hash", "_job_id", "links")
    class AcceleratorsEntry(_message.Message):
        __slots__ = ("key", "value")
        KEY_FIELD_NUMBER: _ClassVar[int]
@@ -142,6 +142,13 @@ class ManagedJobInfo(_message.Message):
        key: str
        value: str
        def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
+    class LinksEntry(_message.Message):
+        __slots__ = ("key", "value")
+        KEY_FIELD_NUMBER: _ClassVar[int]
+        VALUE_FIELD_NUMBER: _ClassVar[int]
+        key: str
+        value: str
+        def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
    JOB_ID_FIELD_NUMBER: _ClassVar[int]
    TASK_ID_FIELD_NUMBER: _ClassVar[int]
    JOB_NAME_FIELD_NUMBER: _ClassVar[int]
@@ -171,6 +178,7 @@ class ManagedJobInfo(_message.Message):
    POOL_FIELD_NUMBER: _ClassVar[int]
    POOL_HASH_FIELD_NUMBER: _ClassVar[int]
    _JOB_ID_FIELD_NUMBER: _ClassVar[int]
+    LINKS_FIELD_NUMBER: _ClassVar[int]
    job_id: int
    task_id: int
    job_name: str
@@ -200,7 +208,8 @@ class ManagedJobInfo(_message.Message):
    pool: str
    pool_hash: str
    _job_id: int
-
+    links: _containers.ScalarMap[str, str]
+
def __init__(self, job_id: _Optional[int] = ..., task_id: _Optional[int] = ..., job_name: _Optional[str] = ..., task_name: _Optional[str] = ..., job_duration: _Optional[float] = ..., workspace: _Optional[str] = ..., status: _Optional[_Union[ManagedJobStatus, str]] = ..., schedule_state: _Optional[_Union[ManagedJobScheduleState, str]] = ..., resources: _Optional[str] = ..., cluster_resources: _Optional[str] = ..., cluster_resources_full: _Optional[str] = ..., cloud: _Optional[str] = ..., region: _Optional[str] = ..., infra: _Optional[str] = ..., accelerators: _Optional[_Mapping[str, float]] = ..., recovery_count: _Optional[int] = ..., details: _Optional[str] = ..., failure_reason: _Optional[str] = ..., user_name: _Optional[str] = ..., user_hash: _Optional[str] = ..., submitted_at: _Optional[float] = ..., start_at: _Optional[float] = ..., end_at: _Optional[float] = ..., user_yaml: _Optional[str] = ..., entrypoint: _Optional[str] = ..., metadata: _Optional[_Mapping[str, str]] = ..., pool: _Optional[str] = ..., pool_hash: _Optional[str] = ..., _job_id: _Optional[int] = ..., links: _Optional[_Mapping[str, str]] = ...) -> None: ...

class GetJobTableResponse(_message.Message):
    __slots__ = ("jobs", "total", "total_no_filter", "status_counts")
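The new `links` entry on `ManagedJobInfo` behaves like any protobuf `map<string, string>` field. Below is a minimal sketch of populating and reading it, assuming the generated module from this wheel is importable; the URLs and key names are purely illustrative, only the field names come from the diff above.

# Sketch: exercising the new ManagedJobInfo.links map (map<string, string>).
from sky.schemas.generated import managed_jobsv1_pb2 as pb

info = pb.ManagedJobInfo(job_id=42, job_name='train-llm')
# Map fields cannot be reassigned wholesale (info.links = {...} raises);
# set entries individually instead.
info.links['dashboard'] = 'https://example.invalid/jobs/42'        # hypothetical URL
info.links['logs'] = 'https://example.invalid/jobs/42/logs'        # hypothetical URL

for name, url in sorted(info.links.items()):
    print(f'{name}: {url}')

# The map round-trips through serialization like any other field.
restored = pb.ManagedJobInfo.FromString(info.SerializeToString())
assert restored.links['logs'].endswith('/logs')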
sky/serve/serve_utils.py
CHANGED
@@ -23,6 +23,7 @@ import filelock
from sky import backends
from sky import exceptions
from sky import global_user_state
+from sky import resources as resources_lib
from sky import sky_logging
from sky import skypilot_config
from sky.adaptors import common as adaptors_common
@@ -350,6 +351,13 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
                f'file does not match the pool argument. '
                f'To fix, add a valid `{field_name}` field.')

+    # Validate that pools do not use ordered resources
+    if pool and isinstance(task.resources, list):
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                'Ordered resources are not supported for pools. '
+                'Use `any_of` instead, or specify a single resource.')
+
    policy_description = ('on-demand'
                          if task.service.dynamic_ondemand_fallback else 'spot')
    for resource in list(task.resources):
@@ -360,22 +368,6 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
                    f'{sys_name} will replenish preempted spot '
                    f'with {policy_description} instances.')

-    if pool:
-        accelerators = set()
-        for resource in task.resources:
-            if resource.accelerators is not None:
-                if isinstance(resource.accelerators, str):
-                    accelerators.add(resource.accelerators)
-                elif isinstance(resource.accelerators, dict):
-                    accelerators.update(resource.accelerators.keys())
-                elif isinstance(resource.accelerators, list):
-                    accelerators.update(resource.accelerators)
-        if len(accelerators) > 1:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('Heterogeneous clusters are not supported for '
-                                 'pools please specify one accelerator '
-                                 'for all workers.')
-
    # Try to create a spot placer from the task yaml. Check if the task yaml
    # is valid for spot placer.
    spot_placer.SpotPlacer.from_task(task.service, task)
@@ -730,7 +722,7 @@ def _get_service_status(
        for replica_info in record['replica_info']:
            job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
                service_name, replica_info['name'])
-            replica_info['used_by'] = job_ids
+            replica_info['used_by'] = job_ids
    return record
@@ -810,16 +802,112 @@ get_ready_replicas(
    ]


-def
-
+def _task_fits(task_resources: 'resources_lib.Resources',
+               free_resources: 'resources_lib.Resources') -> bool:
+    """Check if the task resources fit in the free resources."""
+    if not task_resources.less_demanding_than(free_resources,
+                                              check_cloud=False):
+        return False
+    if task_resources.cpus is not None:
+        if (free_resources.cpus is None or
+                task_resources.cpus > free_resources.cpus):
+            return False
+    if task_resources.memory is not None:
+        if (free_resources.memory is None or
+                task_resources.memory > free_resources.memory):
+            return False
+    return True
+
+
+def _is_empty_resource(resource: 'resources_lib.Resources') -> bool:
+    # Returns True if this resource object does not specify any resources.
+    return (resource.cpus is None and resource.memory is None and
+            resource.accelerators is None)
+
+
+def get_free_worker_resources(
+        pool: str) -> Optional[Dict[str, Optional[resources_lib.Resources]]]:
+    """Get free resources for each worker in a pool.
+
+    Args:
+        pool: Pool name (service name)
+
+    Returns:
+        Dictionary mapping cluster_name (worker) to free Resources object (or
+        None if worker is not available or has no free resources).
+    """
+
+    free_resources: Dict[str, Optional[resources_lib.Resources]] = {}
+    replicas = serve_state.get_replica_infos(pool)
+
+    for replica_info in replicas:
+        cluster_name = replica_info.cluster_name
+
+        # Get cluster handle
+        handle = replica_info.handle()
+        if handle is None or handle.launched_resources is None:
+            free_resources[cluster_name] = None
+            continue
+
+        total_resources = handle.launched_resources
+
+        # Get job IDs running on this worker
+        job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
+            pool, cluster_name)
+
+        if len(job_ids) == 0:
+            free_resources[cluster_name] = total_resources
+            continue
+
+        # Get used resources
+        # TODO(lloyd): We should batch the database calls here so that we
+        # make a single call to get all the used resources for all the jobs.
+        used_resources = managed_job_state.get_pool_worker_used_resources(
+            set(job_ids))
+        if used_resources is None:
+            # We failed to get the used resources. We should return None since
+            # we can't make any guarantees about what resources are being used.
+            logger.warning(
+                f'Failed to get used resources for cluster {cluster_name!r}')
+            return None
+
+        if _is_empty_resource(used_resources):
+            # We encountered a job that has no resources specified. We
+            # will not consider it for resource-aware scheduling so it must
+            # be scheduled on its own. To do this we will set the free
+            # worker resources to nothing by returning an empty resource
+            # object.
+            logger.debug(f'Job {job_ids} has no resources specified. '
+                         'Skipping resource-aware scheduling for cluster '
+                         f'{cluster_name!r}')
+            free_resources[cluster_name] = resources_lib.Resources()
+        else:
+            # Calculate free resources using - operator
+            free = total_resources - used_resources
+            free_resources[cluster_name] = free
+
+    return free_resources
+
+
+def get_next_cluster_name(
+    service_name: str,
+    job_id: int,
+    task_resources: Optional[typing.Union[
+        'resources_lib.Resources', typing.Set['resources_lib.Resources'],
+        typing.List['resources_lib.Resources']]] = None
+) -> Optional[str]:
+    """Get the next available cluster name from replicas with sufficient
+    resources.

    Args:
        service_name: The name of the service.
-        job_id:
-
+        job_id: Job ID to associate with the acquired cluster.
+        task_resources: Optional task resource requirements. If provided, will
+            check if resources fit in free worker resources. Can be
+            a single Resources object or a set/list of Resources objects.

    Returns:
-        The cluster name if
+        The cluster name if a suitable replica is found, None otherwise.
    """
    # Check if service exists
    service_status = _get_service_status(service_name,
@@ -831,36 +919,126 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
    if not service_status['pool']:
        logger.error(f'Service {service_name!r} is not a pool.')
        return None
+
    with filelock.FileLock(get_service_filelock_path(service_name)):
+        free_resources = get_free_worker_resources(service_name)
+        logger.debug(f'Free resources: {free_resources!r}')
        logger.debug(f'Get next cluster name for pool {service_name!r}')
        ready_replicas = get_ready_replicas(service_name)
+
+        logger.debug(f'Ready replicas: {ready_replicas!r}')
+
        idle_replicas: List['replica_managers.ReplicaInfo'] = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # If task_resources is provided, use resource-aware scheduling
+        # Normalize task_resources to a list
+        if isinstance(task_resources, resources_lib.Resources):
+            task_resources_list = [task_resources]
+        elif isinstance(task_resources, (set, list)):
+            task_resources_list = list(task_resources)
+        else:
+            task_resources_list = []
+
+        # We should do resource aware scheduling if:
+        # 1. There are task resources.
+        # 2. The first task resource has some resources listed.
+        # 3. There are free resources.
+        # 4. Any free resource has some resources listed.
+        resource_aware = len(task_resources_list) > 0
+        resource_aware = (resource_aware and
+                          not _is_empty_resource(task_resources_list[0]))
+        resource_aware = resource_aware and free_resources is not None
+        if free_resources is not None:
+            for free_resource in free_resources.values():
+                if free_resource is not None and not _is_empty_resource(
+                        free_resource):
+                    resource_aware = True
+                    break
+            else:
+                resource_aware = False
+        else:
+            resource_aware = False
+
+        if resource_aware:
+            logger.debug('Doing resource aware scheduling')
+            for replica_info in ready_replicas:
+                cluster_name = replica_info.cluster_name
+                assert free_resources is not None
+                free_resources_on_worker = free_resources.get(cluster_name)
+                logger.debug(f'Free resources for cluster {cluster_name!r}: '
+                             f'{free_resources_on_worker!r}')
+
+                # Skip if worker has no free resources available
+                if free_resources_on_worker is None:
+                    logger.debug(f'Worker {cluster_name!r} has no free '
+                                 'resources')
+                    continue
+
+                # Check if any of the task resource options fit
+                fits = False
+                for task_res in task_resources_list:
+                    logger.debug(f'Task resources: {task_res!r}')
+                    if _task_fits(task_res, free_resources_on_worker):
+                        logger.debug(f'Task resources {task_res!r} fits'
+                                     ' in free resources '
+                                     f'{free_resources_on_worker!r}')
+                        fits = True
+                        break
+                    else:
+                        logger.debug(f'Task resources {task_res!r} does not fit'
+                                     ' in free resources '
+                                     f'{free_resources_on_worker!r}')
+                if fits:
+                    idle_replicas.append(replica_info)
+        # Also fall back to resource unaware scheduling if no idle replicas are
+        # found. This might be because our launched resources were improperly
+        # set. If that's the case then jobs will fail to schedule in a resource
+        # aware way because one of the resources will be `None` so we can just
+        # fallback to 1 job per replica. If we are truly resource bottlenecked
+        # then we will see that there are jobs running on the replica and will
+        # not schedule another.
+        if len(idle_replicas) == 0:
+            logger.debug('Falling back to resource unaware scheduling')
+            # Fall back to resource unaware scheduling if no task resources
+            # are provided.
+            for replica_info in ready_replicas:
+                jobs_on_replica = (
+                    managed_job_state.get_nonterminal_job_ids_by_pool(
+                        service_name, replica_info.cluster_name))
+                if not jobs_on_replica:
+                    idle_replicas.append(replica_info)
+
        if not idle_replicas:
            logger.info(f'No idle replicas found for pool {service_name!r}')
            return None

        # Select the first idle replica.
-        # TODO(tian): "Load balancing" policy.
        replica_info = idle_replicas[0]
        logger.info(f'Selected replica {replica_info.replica_id} with cluster '
                    f'{replica_info.cluster_name!r} for job {job_id!r} in pool '
                    f'{service_name!r}')
+
+        # If job has heterogeneous resources (any_of/ordered), update
+        # full_resources to the specific resource that was selected for this
+        # worker. This must happen before releasing the filelock to ensure
+        # atomicity with the scheduling decision.
+        if resource_aware and len(task_resources_list) > 1:
+            assert free_resources is not None
+            free_resources_on_worker = free_resources.get(
+                replica_info.cluster_name)
+            if free_resources_on_worker is not None:
+                # Find which task resource fits on this worker
+                for task_res in task_resources_list:
+                    if _task_fits(task_res, free_resources_on_worker):
+                        # Update full_resources in database to this specific
+                        # resource
+                        logger.debug(
+                            f'Updating full_resources for job {job_id!r} '
+                            f'to selected resource: {task_res!r}')
+                        managed_job_state.update_job_full_resources(
+                            job_id, task_res.to_yaml_config())
+                        break
+
        managed_job_state.set_current_cluster_name(job_id,
                                                   replica_info.cluster_name)
        return replica_info.cluster_name
@@ -1541,7 +1719,21 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
        replica_status = record['status']
        status_str = replica_status.colored_str()
        used_by = record.get('used_by', None)
-
+        if used_by is None:
+            used_by_str = '-'
+        elif isinstance(used_by, str):
+            used_by_str = used_by
+        else:
+            if len(used_by) > 2:
+                used_by_str = (
+                    f'{used_by[0]}, {used_by[1]}, +{len(used_by) - 2}'
+                    ' more')
+            elif len(used_by) == 2:
+                used_by_str = f'{used_by[0]}, {used_by[1]}'
+            elif len(used_by) == 1:
+                used_by_str = str(used_by[0])
+            else:
+                used_by_str = '-'

        replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
            'handle']
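The scheduling additions to this file (_task_fits, get_free_worker_resources, get_next_cluster_name above) boil down to: compute per-worker free capacity as launched resources minus in-use resources, then hand the job to the first ready worker whose free capacity covers one of the job's resource options. The following is a toy, dependency-free sketch of that selection rule; the dataclass, function names, and numbers are made up for illustration and deliberately avoid SkyPilot's internal Resources/replica types.

# Toy model of the resource-aware pool scheduling rule introduced above.
from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class Res:
    cpus: Optional[float] = None
    memory: Optional[float] = None  # GB

def fits(task: Res, free: Res) -> bool:
    """Every dimension the task requests must be known on the worker and large enough."""
    if task.cpus is not None and (free.cpus is None or task.cpus > free.cpus):
        return False
    if task.memory is not None and (free.memory is None or task.memory > free.memory):
        return False
    return True

def pick_worker(free_by_worker: Dict[str, Res], task_options: List[Res]) -> Optional[str]:
    """Return the first worker that can host any of the task's resource options."""
    for worker, free in free_by_worker.items():
        if any(fits(opt, free) for opt in task_options):
            return worker
    return None

free = {'worker-0': Res(cpus=2, memory=4), 'worker-1': Res(cpus=8, memory=32)}
print(pick_worker(free, [Res(cpus=4, memory=16)]))  # -> worker-1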
sky/serve/server/impl.py
CHANGED
@@ -517,7 +517,7 @@ def update(
            f'{workers} is not supported. Ignoring the update.')

    # Load the existing task configuration from the service's YAML file
-    yaml_content = service_record['
+    yaml_content = service_record['pool_yaml']

    # Load the existing task configuration
    task = task_lib.Task.from_yaml_str(yaml_content)
sky/server/common.py
CHANGED
@@ -17,6 +17,7 @@ import time
import typing
from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
                    Tuple, TypeVar, Union)
+from urllib.request import Request
import uuid

import cachetools
@@ -147,6 +148,22 @@ def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
    return cookie_jar


+def get_cookie_header_for_url(url: str) -> Dict[str, str]:
+    """Extract Cookie header value from a cookie jar for a specific URL"""
+    cookies = get_api_cookie_jar()
+    if not cookies:
+        return {}
+
+    # Use urllib Request to do URL-aware cookie filtering
+    request = Request(url)
+    cookies.add_cookie_header(request)
+    cookie_header = request.get_header('Cookie')
+
+    if cookie_header is None:
+        return {}
+    return {'Cookie': cookie_header}
+
+
def set_api_cookie_jar(cookie_jar: CookieJar,
                       create_if_not_exists: bool = True) -> None:
    """Updates the file cookie jar with the given cookie jar."""
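The get_cookie_header_for_url helper added above rests on a standard-library trick: an http.cookiejar-compatible jar, handed a urllib.request.Request, attaches only the cookies whose domain/path/secure attributes match that URL. A self-contained sketch of the same technique with a throwaway jar (the cookie names, values, and domains are invented):

# Sketch of URL-aware cookie filtering, the technique used by
# get_cookie_header_for_url() above. Uses only requests + the stdlib.
from urllib.request import Request
from requests.cookies import RequestsCookieJar

jar = RequestsCookieJar()
jar.set('session', 'abc123', domain='api.example.invalid', path='/')
jar.set('other', 'zzz', domain='elsewhere.invalid', path='/')

request = Request('https://api.example.invalid/api/v1/status')
jar.add_cookie_header(request)          # attaches only cookies matching the URL
header = request.get_header('Cookie')   # e.g. 'session=abc123'
headers = {'Cookie': header} if header else {}
print(headers)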
sky/server/constants.py
CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
# based on version info is needed.
# For more details and code guidelines, refer to:
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 26

# The minimum peer API version that the code should still work with.
# Notes (dev):
sky/server/metrics.py
CHANGED
@@ -48,10 +48,12 @@ async def gpu_metrics() -> fastapi.Response:
    all_metrics: List[str] = []
    successful_contexts = 0

+    remote_contexts = [
+        context for context in contexts if context != 'in-cluster'
+    ]
    tasks = [
        asyncio.create_task(metrics_utils.get_metrics_for_context(context))
-        for context in
-        if context != 'in-cluster'
+        for context in remote_contexts
    ]

    results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -59,7 +61,8 @@ async def gpu_metrics() -> fastapi.Response:
    for i, result in enumerate(results):
        if isinstance(result, Exception):
            logger.error(
-                f'Failed to get metrics for context {
+                f'Failed to get metrics for context {remote_contexts[i]}: '
+                f'{result}')
        elif isinstance(result, BaseException):
            # Avoid changing behavior for non-Exception BaseExceptions
            # like KeyboardInterrupt/SystemExit: re-raise them.
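The metrics fix above addresses the classic asyncio.gather(..., return_exceptions=True) pitfall: results come back positionally, so error messages must be indexed against the same list the tasks were built from (remote_contexts), not the unfiltered contexts list. A minimal standalone sketch of that pattern follows; fetch() is a placeholder coroutine, not the real metrics_utils.get_metrics_for_context.

# Sketch: pairing gather() results back to their inputs, as the fix above does
# with remote_contexts[i].
import asyncio

async def fetch(context: str) -> str:
    if context == 'broken':
        raise RuntimeError('scrape failed')
    return f'# metrics for {context}'

async def main() -> None:
    contexts = ['in-cluster', 'gke-a', 'broken']
    remote_contexts = [c for c in contexts if c != 'in-cluster']
    tasks = [asyncio.create_task(fetch(c)) for c in remote_contexts]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for context, result in zip(remote_contexts, results):
        if isinstance(result, Exception):
            print(f'Failed to get metrics for context {context}: {result}')
        else:
            print(result)

asyncio.run(main())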