skypilot-nightly 1.0.0.dev20250509-py3-none-any.whl → 1.0.0.dev20251107-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
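
A large share of the entries above come from a single refactor: the catalog package moved from sky/clouds/service_catalog to sky/catalog. The snippet below is a hypothetical compatibility shim for code that must import the catalog across nightlies on either side of the move; the aliasing choice is an assumption inferred from the file paths, not SkyPilot-provided code.

# Hypothetical import shim implied by the sky/{clouds/service_catalog -> catalog} moves above.
try:
    from sky import catalog  # layout in 1.0.0.dev20251107
except ImportError:
    # Fall back to the pre-rename path for older nightlies.
    from sky.clouds import service_catalog as catalog  # layout in 1.0.0.dev20250509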
sky/skylet/ray_patches/worker.py.diff
ADDED

@@ -0,0 +1,41 @@
--- a/worker.py
+++ b/worker.py
@@ -1,3 +1,7 @@
+# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py
+# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/9233
+# Tracked in PR https://github.com/ray-project/ray/pull/21977/files.
+
 import atexit
 import faulthandler
 import functools
@@ -2020,6 +2024,14 @@
     pid = data.get("pid")
     lines = data.get("lines", [])

+    def end_for(line: str) -> str:
+        if sys.platform == "win32":
+            return "\n"
+        if line.endswith("\r"):
+            return ""
+        return "\n"
+
+
     if data.get("ip") == data.get("localhost"):
         for line in lines:
             if RAY_TQDM_MAGIC in line:
@@ -2035,6 +2047,7 @@
                         message_for(data, line),
                     ),
                     file=print_file,
+                    end=end_for(line),
                 )
     else:
         for line in lines:
@@ -2052,6 +2065,7 @@
                         message_for(data, line),
                     ),
                     file=print_file,
+                    end=end_for(line),
                 )
     # Restore once at end of batch to avoid excess hiding/unhiding of tqdm.
     restore_tqdm()
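
The patch above changes only how worker log lines are terminated: end_for() keeps lines ending in a carriage return on the same row (except on Windows), so tqdm-style progress output can overwrite itself in place instead of flooding the log with one line per update. A minimal standalone sketch of that behavior, not SkyPilot code:

# Demonstrates why print(..., end=end_for(line)) preserves in-place
# progress updates: a trailing "\r" returns the cursor to column 0,
# and suppressing the default "\n" lets the next print overwrite it.
import time

def end_for(line: str) -> str:
    # Same rule as the patched helper, minus the win32 special case.
    return "" if line.endswith("\r") else "\n"

for pct in range(0, 101, 25):
    line = f"progress: {pct}%\r"
    print(line, end=end_for(line), flush=True)
    time.sleep(0.1)
print("progress: done")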
sky/skylet/services.py
ADDED
@@ -0,0 +1,564 @@

"""gRPC service implementations for skylet."""

import os
from typing import List, Optional

import grpc

from sky import exceptions
from sky import sky_logging
from sky.jobs import state as managed_job_state
from sky.jobs import utils as managed_job_utils
from sky.schemas.generated import autostopv1_pb2
from sky.schemas.generated import autostopv1_pb2_grpc
from sky.schemas.generated import jobsv1_pb2
from sky.schemas.generated import jobsv1_pb2_grpc
from sky.schemas.generated import managed_jobsv1_pb2
from sky.schemas.generated import managed_jobsv1_pb2_grpc
from sky.schemas.generated import servev1_pb2
from sky.schemas.generated import servev1_pb2_grpc
from sky.serve import serve_rpc_utils
from sky.serve import serve_state
from sky.serve import serve_utils
from sky.skylet import autostop_lib
from sky.skylet import constants
from sky.skylet import job_lib
from sky.skylet import log_lib

logger = sky_logging.init_logger(__name__)

# In the worst case, flush the log buffer every 50ms,
# to ensure responsiveness.
DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05


class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
    """Implementation of the AutostopService gRPC service."""

    def SetAutostop(  # type: ignore[return]
            self, request: autostopv1_pb2.SetAutostopRequest,
            context: grpc.ServicerContext
    ) -> autostopv1_pb2.SetAutostopResponse:
        """Sets autostop configuration for the cluster."""
        try:
            wait_for = autostop_lib.AutostopWaitFor.from_protobuf(
                request.wait_for)
            autostop_lib.set_autostop(
                idle_minutes=request.idle_minutes,
                backend=request.backend,
                wait_for=wait_for if wait_for is not None else
                autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
                down=request.down)
            return autostopv1_pb2.SetAutostopResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def IsAutostopping(  # type: ignore[return]
            self, request: autostopv1_pb2.IsAutostoppingRequest,
            context: grpc.ServicerContext
    ) -> autostopv1_pb2.IsAutostoppingResponse:
        """Checks if the cluster is currently autostopping."""
        try:
            is_autostopping = autostop_lib.get_is_autostopping()
            return autostopv1_pb2.IsAutostoppingResponse(
                is_autostopping=is_autostopping)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))


class ServeServiceImpl(servev1_pb2_grpc.ServeServiceServicer):
    """Implementation of the ServeService gRPC service."""

    # NOTE (kyuds): this grpc service will run cluster-side,
    # thus guaranteeing that SERVE_VERSION is above 5.
    # Therefore, we removed some SERVE_VERSION checks
    # present in the original codegen.

    def GetServiceStatus(  # type: ignore[return]
            self, request: servev1_pb2.GetServiceStatusRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.GetServiceStatusResponse:
        """Gets serve status."""
        try:
            service_names, pool = (
                serve_rpc_utils.GetServiceStatusRequestConverter.from_proto(request))  # pylint: disable=line-too-long
            statuses = serve_utils.get_service_status_pickled(
                service_names, pool)
            return serve_rpc_utils.GetServiceStatusResponseConverter.to_proto(
                statuses)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def AddVersion(  # type: ignore[return]
            self, request: servev1_pb2.AddVersionRequest,
            context: grpc.ServicerContext) -> servev1_pb2.AddVersionResponse:
        """Adds serve version"""
        try:
            service_name = request.service_name
            version = serve_state.add_version(service_name)
            return servev1_pb2.AddVersionResponse(version=version)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TerminateServices(  # type: ignore[return]
            self, request: servev1_pb2.TerminateServicesRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.TerminateServicesResponse:
        """Terminates serve"""
        try:
            service_names, purge, pool = (
                serve_rpc_utils.TerminateServicesRequestConverter.from_proto(request))  # pylint: disable=line-too-long
            message = serve_utils.terminate_services(service_names, purge, pool)
            return servev1_pb2.TerminateServicesResponse(message=message)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TerminateReplica(  # type: ignore[return]
            self, request: servev1_pb2.TerminateReplicaRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.TerminateReplicaResponse:
        """Terminate replica"""
        try:
            service_name = request.service_name
            replica_id = request.replica_id
            purge = request.purge
            message = serve_utils.terminate_replica(service_name, replica_id,
                                                    purge)
            return servev1_pb2.TerminateReplicaResponse(message=message)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def WaitServiceRegistration(  # type: ignore[return]
            self, request: servev1_pb2.WaitServiceRegistrationRequest,
            context: grpc.ServicerContext
    ) -> servev1_pb2.WaitServiceRegistrationResponse:
        """Wait for service to be registered"""
        try:
            service_name = request.service_name
            job_id = request.job_id
            pool = request.pool
            encoded = serve_utils.wait_service_registration(
                service_name, job_id, pool)
            lb_port = serve_utils.load_service_initialization_result(encoded)
            return servev1_pb2.WaitServiceRegistrationResponse(lb_port=lb_port)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def UpdateService(  # type: ignore[return]
            self, request: servev1_pb2.UpdateServiceRequest,
            context: grpc.ServicerContext) -> servev1_pb2.UpdateServiceResponse:
        """Update service"""
        try:
            service_name = request.service_name
            version = request.version
            mode = request.mode
            pool = request.pool
            serve_utils.update_service_encoded(service_name, version, mode,
                                               pool)
            return servev1_pb2.UpdateServiceResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))


class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
    """Implementation of the JobsService gRPC service."""

    def AddJob(  # type: ignore[return]
            self, request: jobsv1_pb2.AddJobRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.AddJobResponse:
        try:
            job_name = request.job_name if request.HasField('job_name') else '-'
            job_id, log_dir = job_lib.add_job(job_name, request.username,
                                              request.run_timestamp,
                                              request.resources_str,
                                              request.metadata)
            return jobsv1_pb2.AddJobResponse(job_id=job_id, log_dir=log_dir)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def QueueJob(  # type: ignore[return]
            self, request: jobsv1_pb2.QueueJobRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.QueueJobResponse:
        try:
            job_id = request.job_id
            # Create log directory and file
            remote_log_dir = os.path.expanduser(request.remote_log_dir)
            os.makedirs(remote_log_dir, exist_ok=True)
            remote_log_path = os.path.join(remote_log_dir, 'run.log')
            open(remote_log_path, 'a').close()  # pylint: disable=unspecified-encoding

            script_path = os.path.expanduser(request.script_path)
            os.makedirs(os.path.dirname(script_path), exist_ok=True)

            # If `codegen` is not provided, assume script is already
            # uploaded to `script_path` via rsync.
            if request.HasField('codegen'):
                with open(script_path, 'w', encoding='utf-8') as f:
                    f.write(request.codegen)
                os.chmod(script_path, 0o755)

            cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
            job_submit_cmd = (
                # JOB_CMD_IDENTIFIER is used for identifying the process
                # retrieved with pid is the same driver process.
                f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
                f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
                # Do not use &>, which is not POSIX and may not work.
                # Note that the order of ">filename 2>&1" matters.
                f' > {remote_log_path} 2>&1')
            job_lib.scheduler.queue(job_id, job_submit_cmd)

            if request.HasField('managed_job'):
                managed_job = request.managed_job
                pool = managed_job.pool if managed_job.HasField(
                    'pool') else None
                pool_hash = None
                if pool is not None:
                    pool_hash = serve_state.get_service_hash(pool)
                # Add the managed job to job queue database.
                user_id = managed_job.user_id if managed_job.HasField(
                    'user_id') else None
                managed_job_state.set_job_info(job_id, managed_job.name,
                                               managed_job.workspace,
                                               managed_job.entrypoint, pool,
                                               pool_hash, user_id)
                # Set the managed job to PENDING state to make sure that
                # this managed job appears in the `sky jobs queue`, even
                # if it needs to wait to be submitted.
                # We cannot set the managed job to PENDING state in the
                # job template (jobs-controller.yaml.j2), as it may need
                # to wait for the run commands to be scheduled on the job
                # controller in high-load cases.
                for task in managed_job.tasks:
                    managed_job_state.set_pending(job_id, task.task_id,
                                                  task.name, task.resources_str,
                                                  task.metadata_json)
            return jobsv1_pb2.QueueJobResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def UpdateStatus(  # type: ignore[return]
            self, request: jobsv1_pb2.UpdateStatusRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.UpdateStatusResponse:
        try:
            job_lib.update_status()
            return jobsv1_pb2.UpdateStatusResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobQueue(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobQueueRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobQueueResponse:
        try:
            user_hash = request.user_hash if request.HasField(
                'user_hash') else None
            all_jobs = request.all_jobs
            jobs_info = job_lib.get_jobs_info(user_hash=user_hash,
                                              all_jobs=all_jobs)
            return jobsv1_pb2.GetJobQueueResponse(jobs=jobs_info)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
            self, request: jobsv1_pb2.CancelJobsRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.CancelJobsResponse:
        try:
            job_ids = list(request.job_ids) if request.job_ids else []
            user_hash = request.user_hash if request.HasField(
                'user_hash') else None
            cancelled_job_ids = job_lib.cancel_jobs(job_ids, request.cancel_all,
                                                    user_hash)
            return jobsv1_pb2.CancelJobsResponse(
                cancelled_job_ids=cancelled_job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def FailAllInProgressJobs(  # type: ignore[return]
            self, _: jobsv1_pb2.FailAllInProgressJobsRequest,
            context: grpc.ServicerContext
    ) -> jobsv1_pb2.FailAllInProgressJobsResponse:
        try:
            job_lib.fail_all_jobs_in_progress()
            return jobsv1_pb2.FailAllInProgressJobsResponse()
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def TailLogs(
            self,
            request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        buffer = log_lib.LogBuffer()
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            managed_job_id = request.managed_job_id if request.HasField(
                'managed_job_id') else None
            log_dir = job_lib.get_log_dir_for_job(job_id)
            if log_dir is None:
                run_timestamp = job_lib.get_run_timestamp(job_id)
                log_dir = None if run_timestamp is None else os.path.join(
                    constants.SKY_LOGS_DIRECTORY, run_timestamp)

            for line in log_lib.buffered_iter_with_timeout(
                    buffer,
                    log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
                                           request.follow, request.tail),
                    DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
                yield jobsv1_pb2.TailLogsResponse(log_line=line)

            job_status = job_lib.get_status(job_id)
            exit_code = exceptions.JobExitCode.from_job_status(job_status)
            # Fix for dashboard: When follow=False and job is still running
            # (NOT_FINISHED=101), exit with success (0) since fetching current
            # logs is a successful operation.
            # This prevents shell wrappers from printing "command terminated
            # with exit code 101".
            exit_code_int = 0 if not request.follow and int(
                exit_code) == 101 else int(exit_code)
            yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        finally:
            buffer.close()

    def GetJobStatus(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobStatusRequest,
            context: grpc.ServicerContext) -> jobsv1_pb2.GetJobStatusResponse:
        try:
            if request.job_ids:
                job_ids = list(request.job_ids)
            else:
                latest_job_id = job_lib.get_latest_job_id()
                job_ids = [latest_job_id] if latest_job_id is not None else []
            job_statuses = job_lib.get_statuses(job_ids)
            for job_id, status in job_statuses.items():
                job_statuses[job_id] = job_lib.JobStatus(status).to_protobuf(
                ) if status is not None else jobsv1_pb2.JOB_STATUS_UNSPECIFIED
            return jobsv1_pb2.GetJobStatusResponse(job_statuses=job_statuses)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobSubmittedTimestamp(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobSubmittedTimestampRequest,
            context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetJobSubmittedTimestampResponse:
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
                job_id, False)
            if timestamp is None:
                context.abort(grpc.StatusCode.NOT_FOUND,
                              f'Job {job_id} not found')
            return jobsv1_pb2.GetJobSubmittedTimestampResponse(
                timestamp=timestamp)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobEndedTimestamp(  # type: ignore[return]
            self, request: jobsv1_pb2.GetJobEndedTimestampRequest,
            context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetJobEndedTimestampResponse:
        try:
            job_id = request.job_id if request.HasField(
                'job_id') else job_lib.get_latest_job_id()
            timestamp = job_lib.get_job_submitted_or_ended_timestamp(
                job_id, True)
            if timestamp is None:
                context.abort(grpc.StatusCode.NOT_FOUND,
                              f'Job {job_id} not found or not ended')
            return jobsv1_pb2.GetJobEndedTimestampResponse(timestamp=timestamp)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetLogDirsForJobs(  # type: ignore[return]
            self, request: jobsv1_pb2.GetLogDirsForJobsRequest,
            context: grpc.ServicerContext
    ) -> jobsv1_pb2.GetLogDirsForJobsResponse:
        try:
            if request.job_ids:
                job_ids = list(request.job_ids)
            else:
                latest_job_id = job_lib.get_latest_job_id()
                job_ids = [latest_job_id] if latest_job_id is not None else []
            job_log_dirs = job_lib.get_job_log_dirs(job_ids)
            return jobsv1_pb2.GetLogDirsForJobsResponse(
                job_log_dirs=job_log_dirs)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))


class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                            ):
    """Implementation of the ManagedJobsService gRPC service."""

    def GetVersion(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetVersionRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetVersionResponse:
        try:
            return managed_jobsv1_pb2.GetVersionResponse(
                controller_version=constants.SKYLET_VERSION)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobTable(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetJobTableRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetJobTableResponse:
        try:
            accessible_workspaces = (
                list(request.accessible_workspaces.workspaces)
                if request.HasField('accessible_workspaces') else None)
            job_ids = (list(request.job_ids.ids)
                       if request.HasField('job_ids') else None)
            user_hashes: Optional[List[Optional[str]]] = None
            if request.HasField('user_hashes'):
                user_hashes = list(request.user_hashes.hashes)
                # For backwards compatibility, we show jobs that do not have a
                # user_hash. TODO: Remove before 0.12.0.
                if request.show_jobs_without_user_hash:
                    user_hashes.append(None)
            statuses = (list(request.statuses.statuses)
                        if request.HasField('statuses') else None)
            fields = (list(request.fields.fields)
                      if request.HasField('fields') else None)
            job_queue = managed_job_utils.get_managed_job_queue(
                skip_finished=request.skip_finished,
                accessible_workspaces=accessible_workspaces,
                job_ids=job_ids,
                workspace_match=request.workspace_match
                if request.HasField('workspace_match') else None,
                name_match=request.name_match
                if request.HasField('name_match') else None,
                pool_match=request.pool_match
                if request.HasField('pool_match') else None,
                page=request.page if request.HasField('page') else None,
                limit=request.limit if request.HasField('limit') else None,
                user_hashes=user_hashes,
                statuses=statuses,
                fields=fields,
            )
            jobs = job_queue['jobs']
            total = job_queue['total']
            total_no_filter = job_queue['total_no_filter']
            status_counts = job_queue['status_counts']

            jobs_info = []
            for job in jobs:
                converted_metadata = None
                metadata = job.get('metadata')
                if metadata:
                    converted_metadata = {
                        k: v for k, v in metadata.items() if v is not None
                    }
                job_info = managed_jobsv1_pb2.ManagedJobInfo(
                    # The `spot.job_id`, which can be used to identify
                    # different tasks for the same job
                    _job_id=job.get('_job_id'),
                    job_id=job.get('job_id'),
                    task_id=job.get('task_id'),
                    job_name=job.get('job_name'),
                    task_name=job.get('task_name'),
                    job_duration=job.get('job_duration'),
                    workspace=job.get('workspace'),
                    status=managed_job_state.ManagedJobStatus(
                        job.get('status')).to_protobuf(),
                    schedule_state=managed_job_state.ManagedJobScheduleState(
                        job.get('schedule_state')).to_protobuf(),
                    resources=job.get('resources'),
                    cluster_resources=job.get('cluster_resources'),
                    cluster_resources_full=job.get('cluster_resources_full'),
                    cloud=job.get('cloud'),
                    region=job.get('region'),
                    infra=job.get('infra'),
                    accelerators=job.get('accelerators'),
                    recovery_count=job.get('recovery_count'),
                    details=job.get('details'),
                    failure_reason=job.get('failure_reason'),
                    user_name=job.get('user_name'),
                    user_hash=job.get('user_hash'),
                    submitted_at=job.get('submitted_at'),
                    start_at=job.get('start_at'),
                    end_at=job.get('end_at'),
                    user_yaml=job.get('user_yaml'),
                    entrypoint=job.get('entrypoint'),
                    metadata=converted_metadata,
                    pool=job.get('pool'),
                    pool_hash=job.get('pool_hash'))
                jobs_info.append(job_info)

            return managed_jobsv1_pb2.GetJobTableResponse(
                jobs=jobs_info,
                total=total,
                total_no_filter=total_no_filter,
                status_counts=status_counts)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetAllJobIdsByName(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
        try:
            job_name = request.job_name if request.HasField(
                'job_name') else None
            job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
            return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
                job_ids=job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.CancelJobsRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.CancelJobsResponse:
        try:
            cancellation_criteria = request.WhichOneof('cancellation_criteria')
            if cancellation_criteria is None:
                context.abort(
                    grpc.StatusCode.INVALID_ARGUMENT,
                    'exactly one cancellation criteria must be specified.')

            if cancellation_criteria == 'all_users':
                user_hash = request.user_hash if request.HasField(
                    'user_hash') else None
                all_users = request.all_users
                if not all_users and user_hash is None:
                    context.abort(
                        grpc.StatusCode.INVALID_ARGUMENT,
                        'user_hash is required when all_users is False')
                message = managed_job_utils.cancel_jobs_by_id(
                    job_ids=None,
                    all_users=all_users,
                    current_workspace=request.current_workspace,
                    user_hash=user_hash)
            elif cancellation_criteria == 'job_ids':
                job_ids = list(request.job_ids.ids)
                message = managed_job_utils.cancel_jobs_by_id(
                    job_ids=job_ids,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'job_name':
                message = managed_job_utils.cancel_job_by_name(
                    job_name=request.job_name,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'pool_name':
                message = managed_job_utils.cancel_jobs_by_pool(
                    pool_name=request.pool_name,
                    current_workspace=request.current_workspace)
            else:
                context.abort(
                    grpc.StatusCode.INVALID_ARGUMENT,
                    f'invalid cancellation criteria: {cancellation_criteria}')
            return managed_jobsv1_pb2.CancelJobsResponse(message=message)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def StreamLogs(
            self,
            request: managed_jobsv1_pb2.
            StreamLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        # TODO(kevin): implement this
        context.abort(grpc.StatusCode.UNIMPLEMENTED,
                      'StreamLogs is not implemented')