skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/scheduler.py
CHANGED
|
@@ -9,17 +9,22 @@ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
|
|
|
9
9
|
be called from any code running on the managed jobs controller instance to
|
|
10
10
|
trigger scheduling of new jobs if possible. This function should be called
|
|
11
11
|
immediately after any state change that could result in jobs newly being able to
|
|
12
|
-
be scheduled.
|
|
12
|
+
be scheduled. If the job is running in a pool, the scheduler will only schedule
|
|
13
|
+
jobs for the same pool, because the resource limitations are per-pool (see the
|
|
14
|
+
following section for more details).
|
|
13
15
|
|
|
14
|
-
The scheduling logic limits
|
|
16
|
+
The scheduling logic limits #running jobs according to three limits:
|
|
15
17
|
1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
|
|
16
|
-
once, based on the number of CPUs.
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
once, based on the number of CPUs. This is the most compute-intensive part of
|
|
19
|
+
the job lifecycle, which is why we have an additional limit.
|
|
20
|
+
See sky/utils/controller_utils.py::_get_launch_parallelism.
|
|
19
21
|
2. The number of jobs that can be running at any given time, based on the amount
|
|
20
|
-
of memory.
|
|
21
|
-
|
|
22
|
-
|
|
22
|
+
of memory. Since the job controller is doing very little once a job starts
|
|
23
|
+
(just checking its status periodically), the most significant resource it
|
|
24
|
+
consumes is memory.
|
|
25
|
+
See sky/utils/controller_utils.py::_get_job_parallelism.
|
|
26
|
+
3. The number of jobs that can be running in a pool at any given time, based on
|
|
27
|
+
the number of ready workers in the pool. (See _can_start_new_job.)
|
|
23
28
|
|
|
24
29
|
The state of the scheduler is entirely determined by the schedule_state column
|
|
25
30
|
of all the jobs in the job_info table. This column should only be modified via
|
|
@@ -37,152 +42,220 @@ Nomenclature:
|
|
|
37
42
|
"""
|
|
38
43
|
|
|
39
44
|
from argparse import ArgumentParser
|
|
45
|
+
import asyncio
|
|
40
46
|
import contextlib
|
|
41
|
-
from functools import lru_cache
|
|
42
47
|
import os
|
|
43
|
-
import
|
|
48
|
+
import pathlib
|
|
49
|
+
import shutil
|
|
50
|
+
import sys
|
|
44
51
|
import typing
|
|
52
|
+
from typing import List, Optional, Set
|
|
53
|
+
import uuid
|
|
45
54
|
|
|
46
55
|
import filelock
|
|
47
56
|
|
|
48
57
|
from sky import sky_logging
|
|
58
|
+
from sky import skypilot_config
|
|
49
59
|
from sky.adaptors import common as adaptors_common
|
|
60
|
+
from sky.client import sdk
|
|
50
61
|
from sky.jobs import constants as managed_job_constants
|
|
51
62
|
from sky.jobs import state
|
|
63
|
+
from sky.jobs import utils as managed_job_utils
|
|
52
64
|
from sky.skylet import constants
|
|
53
|
-
from sky.utils import
|
|
65
|
+
from sky.utils import controller_utils
|
|
54
66
|
from sky.utils import subprocess_utils
|
|
55
67
|
|
|
56
68
|
if typing.TYPE_CHECKING:
|
|
69
|
+
import logging
|
|
70
|
+
|
|
57
71
|
import psutil
|
|
58
72
|
else:
|
|
59
73
|
psutil = adaptors_common.LazyImport('psutil')
|
|
60
74
|
|
|
61
75
|
logger = sky_logging.init_logger('sky.jobs.controller')
|
|
62
76
|
|
|
63
|
-
#
|
|
64
|
-
#
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
77
|
+
# Lock file for the job controller. It synchronizes reads and writes of the
# controller pid file defined below.
JOB_CONTROLLER_PID_LOCK = os.path.expanduser(
    '~/.sky/locks/job_controller_pid.lock')

# File recording controller processes, one entry per line, in the form
# `<pid>,<started_at>` (or the legacy form `<pid>`).
JOB_CONTROLLER_PID_PATH = os.path.expanduser('~/.sky/job_controller_pid')
# Env file that must already be set up before the controller is started.
# NOTE(review): its exact format is not visible here -- confirm with writers.
JOB_CONTROLLER_ENV_PATH = os.path.expanduser('~/.sky/job_controller_env')

# NOTE(review): presumably holds the hash of the currently installed SkyPilot
# wheel (judging by the path only) -- confirm against where it is read.
CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _parse_controller_pid_entry(
        entry: str) -> Optional[state.ControllerPidRecord]:
    """Parse one line of the controller pid file into a record.

    Accepted formats are `<pid>,<started_at>` (pid is an integer, started_at
    is a float) and, for backwards compatibility, a bare `<pid>`.

    Args:
        entry: A single raw line from the pid file.

    Returns:
        The parsed record, or None if the line is blank, has more than two
        comma-separated fields, or its pid is not an integer. A started_at
        that is missing or not a float yields a record with started_at=None.
    """
    text = entry.strip()
    # Blank lines and lines with three or more fields are not valid entries.
    if not text or text.count(',') > 1:
        return None

    # With no comma, the timestamp text comes back empty. That is the legacy
    # pid-only format (backwards compatibility, pre-#7847).
    # TODO(cooperc): Remove for 0.13.0
    pid_text, _, started_at_text = text.partition(',')

    try:
        pid = int(pid_text)
    except ValueError:
        return None

    started_at: Optional[float] = None
    if started_at_text:
        try:
            started_at = float(started_at_text)
        except ValueError:
            # An unparsable timestamp is tolerated; keep the pid regardless.
            started_at = None
    return state.ControllerPidRecord(pid=pid, started_at=started_at)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def get_controller_process_records(
|
|
123
|
+
) -> Optional[List[state.ControllerPidRecord]]:
|
|
124
|
+
"""Return recorded controller processes if the file can be read."""
|
|
125
|
+
if not os.path.exists(JOB_CONTROLLER_PID_PATH):
|
|
126
|
+
# If the file doesn't exist, it means the controller server is not
|
|
127
|
+
# running, so we return an empty list
|
|
128
|
+
return []
|
|
129
|
+
try:
|
|
130
|
+
with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
|
|
131
|
+
lines = f.read().splitlines()
|
|
132
|
+
except (FileNotFoundError, OSError):
|
|
133
|
+
return None
|
|
134
|
+
|
|
135
|
+
records: List[state.ControllerPidRecord] = []
|
|
136
|
+
for line in lines:
|
|
137
|
+
record = _parse_controller_pid_entry(line)
|
|
138
|
+
if record is not None:
|
|
139
|
+
records.append(record)
|
|
140
|
+
return records
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _append_controller_pid_record(pid: int,
|
|
144
|
+
started_at: Optional[float]) -> None:
|
|
145
|
+
# Note: started_at is a float, but converting to a string will not lose any
|
|
146
|
+
# precision. See https://docs.python.org/3/tutorial/floatingpoint.html and
|
|
147
|
+
# https://github.com/python/cpython/issues/53583
|
|
148
|
+
entry = str(pid) if started_at is None else f'{pid},{started_at}'
|
|
149
|
+
with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f:
|
|
150
|
+
f.write(entry + '\n')
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def start_controller() -> None:
|
|
154
|
+
"""Start the job controller process.
|
|
155
|
+
|
|
156
|
+
This requires that the env file is already set up.
|
|
157
|
+
"""
|
|
158
|
+
os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
|
|
159
|
+
logs_dir = os.path.expanduser(
|
|
160
|
+
managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
|
|
161
|
+
os.makedirs(logs_dir, exist_ok=True)
|
|
162
|
+
controller_uuid = str(uuid.uuid4())
|
|
163
|
+
log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')
|
|
164
|
+
|
|
165
|
+
activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
|
|
166
|
+
run_controller_cmd = (f'{sys.executable} -u -m'
|
|
167
|
+
f'sky.jobs.controller {controller_uuid}')
|
|
168
|
+
|
|
169
|
+
run_cmd = (f'{activate_python_env_cmd}'
|
|
170
|
+
f'{run_controller_cmd}')
|
|
171
|
+
|
|
172
|
+
logger.info(f'Running controller with command: {run_cmd}')
|
|
173
|
+
|
|
174
|
+
pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
|
|
175
|
+
pid_started_at = psutil.Process(pid).create_time()
|
|
176
|
+
_append_controller_pid_record(pid, pid_started_at)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def get_alive_controllers() -> Optional[int]:
|
|
180
|
+
records = get_controller_process_records()
|
|
181
|
+
if records is None:
|
|
182
|
+
# If we cannot read the file reliably, avoid starting extra controllers.
|
|
183
|
+
return None
|
|
184
|
+
if not records:
|
|
185
|
+
return 0
|
|
186
|
+
|
|
187
|
+
alive = 0
|
|
188
|
+
for record in records:
|
|
189
|
+
if managed_job_utils.controller_process_alive(record, quiet=False):
|
|
190
|
+
alive += 1
|
|
191
|
+
return alive
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def maybe_start_controllers(from_scheduler: bool = False) -> None:
|
|
195
|
+
"""Start the job controller process.
|
|
196
|
+
|
|
197
|
+
If the process is already running, it will not start a new one.
|
|
198
|
+
Will also add the job_id, dag_yaml_path, and env_file_path to the
|
|
199
|
+
controllers list of processes.
|
|
116
200
|
"""
|
|
201
|
+
# In consolidation mode, during rolling update, two API servers may be
|
|
202
|
+
# running. If we are on the new API server, and we haven't finished the
|
|
203
|
+
# recovery process, we should avoid starting new controllers. The old API
|
|
204
|
+
# server/consolidated jobs controller could run update_managed_jobs_statuses
|
|
205
|
+
# and if there are jobs running on the new API server, the old one will not
|
|
206
|
+
# see the corresponding processes and may mark them as FAILED_CONTROLLER.
|
|
207
|
+
if from_scheduler and managed_job_utils.is_consolidation_mode(
|
|
208
|
+
) and os.path.exists(
|
|
209
|
+
os.path.expanduser(
|
|
210
|
+
constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
|
|
211
|
+
# This could happen during an API server rolling update, or during
|
|
212
|
+
# normal running while managed-job-status-refresh-daemon is running. In
|
|
213
|
+
# either case, the controllers should be already started or will be
|
|
214
|
+
# started by the recovery process.
|
|
215
|
+
logger.info('Recovery is still in progress, skipping controller start.')
|
|
216
|
+
return
|
|
117
217
|
try:
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
dag_yaml_path = maybe_next_job['dag_yaml_path']
|
|
160
|
-
|
|
161
|
-
activate_python_env_cmd = (
|
|
162
|
-
f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
|
|
163
|
-
env_file = maybe_next_job['env_file_path']
|
|
164
|
-
source_environment_cmd = (f'source {env_file};'
|
|
165
|
-
if env_file else '')
|
|
166
|
-
run_controller_cmd = ('python -u -m sky.jobs.controller '
|
|
167
|
-
f'{dag_yaml_path} --job-id {job_id};')
|
|
168
|
-
|
|
169
|
-
# If the command line here is changed, please also update
|
|
170
|
-
# utils._controller_process_alive. `--job-id X` should be at
|
|
171
|
-
# the end.
|
|
172
|
-
run_cmd = (f'{activate_python_env_cmd}'
|
|
173
|
-
f'{source_environment_cmd}'
|
|
174
|
-
f'{run_controller_cmd}')
|
|
175
|
-
|
|
176
|
-
logs_dir = os.path.expanduser(
|
|
177
|
-
managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
|
|
178
|
-
os.makedirs(logs_dir, exist_ok=True)
|
|
179
|
-
log_path = os.path.join(logs_dir, f'{job_id}.log')
|
|
180
|
-
|
|
181
|
-
pid = subprocess_utils.launch_new_process_tree(
|
|
182
|
-
run_cmd, log_output=log_path)
|
|
183
|
-
state.set_job_controller_pid(job_id, pid)
|
|
184
|
-
|
|
185
|
-
logger.debug(f'Job {job_id} started with pid {pid}')
|
|
218
|
+
with filelock.FileLock(JOB_CONTROLLER_PID_LOCK, blocking=False):
|
|
219
|
+
if from_scheduler and not managed_job_utils.is_consolidation_mode():
|
|
220
|
+
cur = pathlib.Path(CURRENT_HASH)
|
|
221
|
+
old = pathlib.Path(f'{CURRENT_HASH}.old')
|
|
222
|
+
|
|
223
|
+
if old.exists() and cur.exists():
|
|
224
|
+
if (old.read_text(encoding='utf-8') !=
|
|
225
|
+
cur.read_text(encoding='utf-8')):
|
|
226
|
+
# TODO(luca): there is a 1/2^160 chance that there will
|
|
227
|
+
# be a collision. using a geometric distribution and
|
|
228
|
+
# assuming one update a day, we expect a bug slightly
|
|
229
|
+
# before the heat death of the universe. should get
|
|
230
|
+
# this fixed before then.
|
|
231
|
+
try:
|
|
232
|
+
# this will stop all the controllers and the api
|
|
233
|
+
# server.
|
|
234
|
+
sdk.api_stop()
|
|
235
|
+
# All controllers should be dead. Remove the PIDs so
|
|
236
|
+
# that update_managed_jobs_statuses won't think they
|
|
237
|
+
# have failed.
|
|
238
|
+
state.reset_jobs_for_recovery()
|
|
239
|
+
except Exception as e: # pylint: disable=broad-except
|
|
240
|
+
logger.error(f'Failed to stop the api server: {e}')
|
|
241
|
+
pass
|
|
242
|
+
else:
|
|
243
|
+
shutil.copyfile(cur, old)
|
|
244
|
+
if not old.exists():
|
|
245
|
+
shutil.copyfile(cur, old)
|
|
246
|
+
|
|
247
|
+
alive = get_alive_controllers()
|
|
248
|
+
if alive is None:
|
|
249
|
+
return
|
|
250
|
+
wanted = controller_utils.get_number_of_jobs_controllers()
|
|
251
|
+
started = 0
|
|
252
|
+
|
|
253
|
+
while alive + started < wanted:
|
|
254
|
+
start_controller()
|
|
255
|
+
started += 1
|
|
256
|
+
|
|
257
|
+
if started > 0:
|
|
258
|
+
logger.info(f'Started {started} controllers')
|
|
186
259
|
|
|
187
260
|
except filelock.Timeout:
|
|
188
261
|
# If we can't get the lock, just exit. The process holding the lock
|
|
@@ -190,24 +263,64 @@ def maybe_schedule_next_jobs() -> None:
|
|
|
190
263
|
pass
|
|
191
264
|
|
|
192
265
|
|
|
193
|
-
def submit_job(job_id: int, dag_yaml_path: str,
|
|
266
|
+
def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
|
|
267
|
+
env_file_path: str, priority: int) -> None:
|
|
194
268
|
"""Submit an existing job to the scheduler.
|
|
195
269
|
|
|
196
270
|
This should be called after a job is created in the `spot` table as
|
|
197
271
|
PENDING. It will tell the scheduler to try and start the job controller, if
|
|
198
|
-
there are resources available.
|
|
199
|
-
should not be on the critical path for `sky jobs launch -d`.
|
|
272
|
+
there are resources available.
|
|
200
273
|
|
|
201
274
|
The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
|
|
202
275
|
"""
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
276
|
+
controller_process = state.get_job_controller_process(job_id)
|
|
277
|
+
if controller_process is not None:
|
|
278
|
+
# why? TODO(cooperc): figure out why this is needed, fix it, and remove
|
|
279
|
+
if managed_job_utils.controller_process_alive(controller_process,
|
|
280
|
+
job_id):
|
|
281
|
+
# This can happen when HA recovery runs for some reason but the job
|
|
282
|
+
# controller is still alive.
|
|
283
|
+
logger.warning(f'Job {job_id} is still alive with controller '
|
|
284
|
+
f'{controller_process}, skipping submission')
|
|
285
|
+
maybe_start_controllers(from_scheduler=True)
|
|
286
|
+
return
|
|
287
|
+
|
|
288
|
+
with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
|
|
289
|
+
dag_yaml_content = dag_file.read()
|
|
290
|
+
with open(original_user_yaml_path, 'r',
|
|
291
|
+
encoding='utf-8') as original_user_yaml_file:
|
|
292
|
+
original_user_yaml_content = original_user_yaml_file.read()
|
|
293
|
+
with open(env_file_path, 'r', encoding='utf-8') as env_file:
|
|
294
|
+
env_file_content = env_file.read()
|
|
295
|
+
|
|
296
|
+
# Read config file if SKYPILOT_CONFIG env var is set
|
|
297
|
+
config_file_content: Optional[str] = None
|
|
298
|
+
config_file_path = os.environ.get(skypilot_config.ENV_VAR_SKYPILOT_CONFIG)
|
|
299
|
+
if config_file_path:
|
|
300
|
+
config_file_path = os.path.expanduser(config_file_path)
|
|
301
|
+
if os.path.exists(config_file_path):
|
|
302
|
+
with open(config_file_path, 'r', encoding='utf-8') as config_file:
|
|
303
|
+
config_file_content = config_file.read()
|
|
304
|
+
|
|
305
|
+
config_bytes = (len(config_file_content) if config_file_content else 0)
|
|
306
|
+
logger.debug(f'Storing job {job_id} file contents in database '
|
|
307
|
+
f'(DAG bytes={len(dag_yaml_content)}, '
|
|
308
|
+
f'original user yaml bytes={len(original_user_yaml_content)}, '
|
|
309
|
+
f'env bytes={len(env_file_content)}, '
|
|
310
|
+
f'config bytes={config_bytes}).')
|
|
311
|
+
state.scheduler_set_waiting(job_id, dag_yaml_content,
|
|
312
|
+
original_user_yaml_content, env_file_content,
|
|
313
|
+
config_file_content, priority)
|
|
314
|
+
maybe_start_controllers(from_scheduler=True)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
@contextlib.asynccontextmanager
|
|
318
|
+
async def scheduled_launch(
|
|
319
|
+
job_id: int,
|
|
320
|
+
starting: Set[int],
|
|
321
|
+
starting_lock: asyncio.Lock,
|
|
322
|
+
starting_signal: asyncio.Condition,
|
|
323
|
+
):
|
|
211
324
|
"""Launch as part of an ongoing job.
|
|
212
325
|
|
|
213
326
|
A newly started job will already be LAUNCHING, and this will immediately
|
|
@@ -228,23 +341,42 @@ def scheduled_launch(job_id: int):
|
|
|
228
341
|
multiple uses of this context are nested, behavior is undefined. Don't do
|
|
229
342
|
that.
|
|
230
343
|
"""
|
|
344
|
+
pool = state.get_pool_from_job_id(job_id)
|
|
345
|
+
# For pool, since there is no execution.launch, we don't need to have all
|
|
346
|
+
# the ALIVE_WAITING state. The state transition will be
|
|
347
|
+
# WAITING -> ALIVE -> DONE without any intermediate transitions.
|
|
348
|
+
if pool is not None:
|
|
349
|
+
yield
|
|
350
|
+
return
|
|
351
|
+
|
|
352
|
+
assert starting_lock == starting_signal._lock, ( # type: ignore #pylint: disable=protected-access
|
|
353
|
+
'starting_lock and starting_signal must use the same lock')
|
|
231
354
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
355
|
+
while True:
|
|
356
|
+
async with starting_lock:
|
|
357
|
+
starting_count = len(starting)
|
|
358
|
+
if starting_count < controller_utils.LAUNCHES_PER_WORKER:
|
|
359
|
+
break
|
|
360
|
+
logger.info('Too many jobs starting, waiting for a slot')
|
|
361
|
+
await starting_signal.wait()
|
|
238
362
|
|
|
239
|
-
|
|
240
|
-
state.ManagedJobScheduleState.LAUNCHING):
|
|
241
|
-
time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
|
|
363
|
+
logger.info(f'Starting job {job_id}')
|
|
242
364
|
|
|
243
|
-
|
|
365
|
+
async with starting_lock:
|
|
366
|
+
starting.add(job_id)
|
|
244
367
|
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
368
|
+
await state.scheduler_set_launching_async(job_id)
|
|
369
|
+
|
|
370
|
+
try:
|
|
371
|
+
yield
|
|
372
|
+
except Exception as e:
|
|
373
|
+
raise e
|
|
374
|
+
else:
|
|
375
|
+
await state.scheduler_set_alive_async(job_id)
|
|
376
|
+
finally:
|
|
377
|
+
async with starting_lock:
|
|
378
|
+
starting.remove(job_id)
|
|
379
|
+
starting_signal.notify()
|
|
248
380
|
|
|
249
381
|
|
|
250
382
|
def job_done(job_id: int, idempotent: bool = False) -> None:
|
|
@@ -255,46 +387,23 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
|
|
|
255
387
|
|
|
256
388
|
The job could be in any terminal ManagedJobStatus. However, once DONE, it
|
|
257
389
|
should never transition back to another state.
|
|
390
|
+
|
|
391
|
+
This is only called by utils.update_managed_jobs_statuses which is sync.
|
|
258
392
|
"""
|
|
259
393
|
if idempotent and (state.get_job_schedule_state(job_id)
|
|
260
394
|
== state.ManagedJobScheduleState.DONE):
|
|
261
395
|
return
|
|
262
396
|
|
|
263
|
-
|
|
264
|
-
state.scheduler_set_done(job_id, idempotent)
|
|
265
|
-
maybe_schedule_next_jobs()
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
def _set_alive_waiting(job_id: int) -> None:
|
|
269
|
-
"""Should use wait_until_launch_okay() to transition to this state."""
|
|
270
|
-
with filelock.FileLock(_get_lock_path()):
|
|
271
|
-
state.scheduler_set_alive_waiting(job_id)
|
|
272
|
-
maybe_schedule_next_jobs()
|
|
273
|
-
|
|
397
|
+
state.scheduler_set_done(job_id, idempotent)
|
|
274
398
|
|
|
275
|
-
def _get_job_parallelism() -> int:
|
|
276
|
-
job_memory = JOB_MEMORY_MB * 1024 * 1024
|
|
277
|
-
|
|
278
|
-
job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
|
|
279
|
-
|
|
280
|
-
return max(job_limit, 1)
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
def _get_launch_parallelism() -> int:
|
|
284
|
-
cpus = os.cpu_count()
|
|
285
|
-
return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
def _can_start_new_job() -> bool:
|
|
289
|
-
launching_jobs = state.get_num_launching_jobs()
|
|
290
|
-
alive_jobs = state.get_num_alive_jobs()
|
|
291
|
-
return launching_jobs < _get_launch_parallelism(
|
|
292
|
-
) and alive_jobs < _get_job_parallelism()
|
|
293
399
|
|
|
400
|
+
async def job_done_async(job_id: int, idempotent: bool = False):
|
|
401
|
+
"""Async version of job_done."""
|
|
402
|
+
if idempotent and (await state.get_job_schedule_state_async(job_id)
|
|
403
|
+
== state.ManagedJobScheduleState.DONE):
|
|
404
|
+
return
|
|
294
405
|
|
|
295
|
-
|
|
296
|
-
launching_jobs = state.get_num_launching_jobs()
|
|
297
|
-
return launching_jobs < _get_launch_parallelism()
|
|
406
|
+
await state.scheduler_set_done_async(job_id, idempotent)
|
|
298
407
|
|
|
299
408
|
|
|
300
409
|
if __name__ == '__main__':
|
|
@@ -302,6 +411,9 @@ if __name__ == '__main__':
|
|
|
302
411
|
parser.add_argument('dag_yaml',
|
|
303
412
|
type=str,
|
|
304
413
|
help='The path to the user job yaml file.')
|
|
414
|
+
parser.add_argument('--user-yaml-path',
|
|
415
|
+
type=str,
|
|
416
|
+
help='The path to the original user job yaml file.')
|
|
305
417
|
parser.add_argument('--job-id',
|
|
306
418
|
required=True,
|
|
307
419
|
type=int,
|
|
@@ -309,5 +421,18 @@ if __name__ == '__main__':
|
|
|
309
421
|
parser.add_argument('--env-file',
|
|
310
422
|
type=str,
|
|
311
423
|
help='The path to the controller env file.')
|
|
424
|
+
parser.add_argument('--pool',
|
|
425
|
+
type=str,
|
|
426
|
+
required=False,
|
|
427
|
+
default=None,
|
|
428
|
+
help='The pool to use for the controller job.')
|
|
429
|
+
parser.add_argument(
|
|
430
|
+
'--priority',
|
|
431
|
+
type=int,
|
|
432
|
+
default=constants.DEFAULT_PRIORITY,
|
|
433
|
+
help=
|
|
434
|
+
f'Job priority ({constants.MIN_PRIORITY} to {constants.MAX_PRIORITY}).'
|
|
435
|
+
f' Default: {constants.DEFAULT_PRIORITY}.')
|
|
312
436
|
args = parser.parse_args()
|
|
313
|
-
submit_job(args.job_id, args.dag_yaml, args.env_file
|
|
437
|
+
submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
|
|
438
|
+
args.priority)
|