skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/job_lib.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
This is a remote utility module that provides job queue functionality.
|
|
4
4
|
"""
|
|
5
5
|
import enum
|
|
6
|
+
import functools
|
|
6
7
|
import getpass
|
|
7
8
|
import json
|
|
8
9
|
import os
|
|
@@ -10,9 +11,10 @@ import pathlib
|
|
|
10
11
|
import shlex
|
|
11
12
|
import signal
|
|
12
13
|
import sqlite3
|
|
14
|
+
import threading
|
|
13
15
|
import time
|
|
14
16
|
import typing
|
|
15
|
-
from typing import Any, Dict, List, Optional, Sequence
|
|
17
|
+
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
|
16
18
|
|
|
17
19
|
import colorama
|
|
18
20
|
import filelock
|
|
@@ -21,20 +23,22 @@ from sky import global_user_state
|
|
|
21
23
|
from sky import sky_logging
|
|
22
24
|
from sky.adaptors import common as adaptors_common
|
|
23
25
|
from sky.skylet import constants
|
|
26
|
+
from sky.skylet import runtime_utils
|
|
24
27
|
from sky.utils import common_utils
|
|
25
|
-
from sky.utils import db_utils
|
|
26
|
-
from sky.utils import log_utils
|
|
27
28
|
from sky.utils import message_utils
|
|
28
29
|
from sky.utils import subprocess_utils
|
|
30
|
+
from sky.utils.db import db_utils
|
|
29
31
|
|
|
30
32
|
if typing.TYPE_CHECKING:
|
|
31
33
|
import psutil
|
|
34
|
+
|
|
35
|
+
from sky.schemas.generated import jobsv1_pb2
|
|
32
36
|
else:
|
|
33
37
|
psutil = adaptors_common.LazyImport('psutil')
|
|
38
|
+
jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
|
|
34
39
|
|
|
35
40
|
logger = sky_logging.init_logger(__name__)
|
|
36
41
|
|
|
37
|
-
_LINUX_NEW_LINE = '\n'
|
|
38
42
|
_JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
|
|
39
43
|
# JOB_CMD_IDENTIFIER is used for identifying the process retrieved
|
|
40
44
|
# with pid is the same driver process to guard against the case where
|
|
@@ -60,10 +64,8 @@ class JobInfoLoc(enum.IntEnum):
|
|
|
60
64
|
END_AT = 7
|
|
61
65
|
RESOURCES = 8
|
|
62
66
|
PID = 9
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
_DB_PATH = os.path.expanduser('~/.sky/jobs.db')
|
|
66
|
-
os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
|
|
67
|
+
LOG_PATH = 10
|
|
68
|
+
METADATA = 11
|
|
67
69
|
|
|
68
70
|
|
|
69
71
|
def create_table(cursor, conn):
|
|
@@ -82,13 +84,9 @@ def create_table(cursor, conn):
|
|
|
82
84
|
# is not critical and is likely to be enabled by other processes.
|
|
83
85
|
|
|
84
86
|
# Pid column is used for keeping track of the driver process of a job. It
|
|
85
|
-
# can be in
|
|
86
|
-
# -1: The job was submitted with SkyPilot older than #4318, where we use
|
|
87
|
-
# ray job submit to submit the job, i.e. no pid is recorded. This is for
|
|
88
|
-
# backward compatibility and should be removed after 0.10.0.
|
|
87
|
+
# can be in two states:
|
|
89
88
|
# 0: The job driver process has never been started. When adding a job with
|
|
90
|
-
# INIT state, the pid will be set to 0
|
|
91
|
-
# backward compatibility).
|
|
89
|
+
# INIT state, the pid will be set to 0.
|
|
92
90
|
# >0: The job has been started. The pid is the driver process's pid.
|
|
93
91
|
# The driver can be actually running or finished.
|
|
94
92
|
# TODO(SKY-1213): username is actually user hash, should rename.
|
|
@@ -103,7 +101,9 @@ def create_table(cursor, conn):
|
|
|
103
101
|
start_at FLOAT DEFAULT -1,
|
|
104
102
|
end_at FLOAT DEFAULT NULL,
|
|
105
103
|
resources TEXT DEFAULT NULL,
|
|
106
|
-
pid INTEGER DEFAULT -1
|
|
104
|
+
pid INTEGER DEFAULT -1,
|
|
105
|
+
log_dir TEXT DEFAULT NULL,
|
|
106
|
+
metadata TEXT DEFAULT '{}')""")
|
|
107
107
|
|
|
108
108
|
cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
|
|
109
109
|
job_id INTEGER,
|
|
@@ -116,12 +116,38 @@ def create_table(cursor, conn):
|
|
|
116
116
|
db_utils.add_column_to_table(cursor, conn, 'jobs', 'resources', 'TEXT')
|
|
117
117
|
db_utils.add_column_to_table(cursor, conn, 'jobs', 'pid',
|
|
118
118
|
'INTEGER DEFAULT -1')
|
|
119
|
+
db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
|
|
120
|
+
'TEXT DEFAULT NULL')
|
|
121
|
+
db_utils.add_column_to_table(cursor,
|
|
122
|
+
conn,
|
|
123
|
+
'jobs',
|
|
124
|
+
'metadata',
|
|
125
|
+
'TEXT DEFAULT \'{}\'',
|
|
126
|
+
value_to_replace_existing_entries='{}')
|
|
119
127
|
conn.commit()
|
|
120
128
|
|
|
121
129
|
|
|
122
|
-
_DB =
|
|
123
|
-
|
|
124
|
-
|
|
130
|
+
_DB = None
|
|
131
|
+
_db_init_lock = threading.Lock()
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def init_db(func):
|
|
135
|
+
"""Initialize the database."""
|
|
136
|
+
|
|
137
|
+
@functools.wraps(func)
|
|
138
|
+
def wrapper(*args, **kwargs):
|
|
139
|
+
global _DB
|
|
140
|
+
if _DB is not None:
|
|
141
|
+
return func(*args, **kwargs)
|
|
142
|
+
|
|
143
|
+
with _db_init_lock:
|
|
144
|
+
if _DB is None:
|
|
145
|
+
db_path = runtime_utils.get_runtime_dir_path('.sky/jobs.db')
|
|
146
|
+
os.makedirs(pathlib.Path(db_path).parents[0], exist_ok=True)
|
|
147
|
+
_DB = db_utils.SQLiteConn(db_path, create_table)
|
|
148
|
+
return func(*args, **kwargs)
|
|
149
|
+
|
|
150
|
+
return wrapper
|
|
125
151
|
|
|
126
152
|
|
|
127
153
|
class JobStatus(enum.Enum):
|
|
@@ -192,6 +218,45 @@ class JobStatus(enum.Enum):
|
|
|
192
218
|
color = _JOB_STATUS_TO_COLOR[self]
|
|
193
219
|
return f'{color}{self.value}{colorama.Style.RESET_ALL}'
|
|
194
220
|
|
|
221
|
+
@classmethod
|
|
222
|
+
def from_protobuf(
|
|
223
|
+
cls,
|
|
224
|
+
protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
|
|
225
|
+
"""Convert protobuf JobStatus enum to Python enum value."""
|
|
226
|
+
protobuf_to_enum = {
|
|
227
|
+
jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
|
|
228
|
+
jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
|
|
229
|
+
jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
|
|
230
|
+
jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
|
|
231
|
+
jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
|
|
232
|
+
jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
|
|
233
|
+
jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
|
|
234
|
+
jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
|
|
235
|
+
jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
|
|
236
|
+
jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
|
|
237
|
+
}
|
|
238
|
+
if protobuf_value not in protobuf_to_enum:
|
|
239
|
+
raise ValueError(
|
|
240
|
+
f'Unknown protobuf JobStatus value: {protobuf_value}')
|
|
241
|
+
return protobuf_to_enum[protobuf_value]
|
|
242
|
+
|
|
243
|
+
def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
|
|
244
|
+
"""Convert this Python enum value to protobuf enum value."""
|
|
245
|
+
enum_to_protobuf = {
|
|
246
|
+
JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
|
|
247
|
+
JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
|
|
248
|
+
JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
|
|
249
|
+
JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
|
|
250
|
+
JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
|
|
251
|
+
JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
|
|
252
|
+
JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
|
|
253
|
+
JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
|
|
254
|
+
JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
|
|
255
|
+
}
|
|
256
|
+
if self not in enum_to_protobuf:
|
|
257
|
+
raise ValueError(f'Unknown JobStatus value: {self}')
|
|
258
|
+
return enum_to_protobuf[self]
|
|
259
|
+
|
|
195
260
|
|
|
196
261
|
# We have two steps for job submissions:
|
|
197
262
|
# 1. Client reserve a job id from the job table by adding a INIT state job.
|
|
@@ -210,30 +275,33 @@ _PRE_RESOURCE_STATUSES = [JobStatus.PENDING]
|
|
|
210
275
|
class JobScheduler:
|
|
211
276
|
"""Base class for job scheduler"""
|
|
212
277
|
|
|
278
|
+
@init_db
|
|
213
279
|
def queue(self, job_id: int, cmd: str) -> None:
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
280
|
+
assert _DB is not None
|
|
281
|
+
_DB.cursor.execute('INSERT INTO pending_jobs VALUES (?,?,?,?)',
|
|
282
|
+
(job_id, cmd, 0, int(time.time())))
|
|
283
|
+
_DB.conn.commit()
|
|
217
284
|
set_status(job_id, JobStatus.PENDING)
|
|
218
285
|
self.schedule_step()
|
|
219
286
|
|
|
287
|
+
@init_db
|
|
220
288
|
def remove_job_no_lock(self, job_id: int) -> None:
|
|
221
|
-
|
|
222
|
-
|
|
289
|
+
assert _DB is not None
|
|
290
|
+
_DB.cursor.execute(f'DELETE FROM pending_jobs WHERE job_id={job_id!r}')
|
|
291
|
+
_DB.conn.commit()
|
|
223
292
|
|
|
293
|
+
@init_db
|
|
224
294
|
def _run_job(self, job_id: int, run_cmd: str):
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
295
|
+
assert _DB is not None
|
|
296
|
+
_DB.cursor.execute(
|
|
297
|
+
(f'UPDATE pending_jobs SET submit={int(time.time())} '
|
|
298
|
+
f'WHERE job_id={job_id!r}'))
|
|
299
|
+
_DB.conn.commit()
|
|
228
300
|
pid = subprocess_utils.launch_new_process_tree(run_cmd)
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
pid = -1
|
|
234
|
-
_CURSOR.execute((f'UPDATE jobs SET pid={pid} '
|
|
235
|
-
f'WHERE job_id={job_id!r}'))
|
|
236
|
-
_CONN.commit()
|
|
301
|
+
|
|
302
|
+
_DB.cursor.execute((f'UPDATE jobs SET pid={pid} '
|
|
303
|
+
f'WHERE job_id={job_id!r}'))
|
|
304
|
+
_DB.conn.commit()
|
|
237
305
|
|
|
238
306
|
def schedule_step(self, force_update_jobs: bool = False) -> None:
|
|
239
307
|
if force_update_jobs:
|
|
@@ -282,8 +350,10 @@ class JobScheduler:
|
|
|
282
350
|
class FIFOScheduler(JobScheduler):
|
|
283
351
|
"""First in first out job scheduler"""
|
|
284
352
|
|
|
353
|
+
@init_db
|
|
285
354
|
def _get_pending_job_ids(self) -> List[int]:
|
|
286
|
-
|
|
355
|
+
assert _DB is not None
|
|
356
|
+
rows = _DB.cursor.execute(
|
|
287
357
|
'SELECT job_id FROM pending_jobs ORDER BY job_id').fetchall()
|
|
288
358
|
return [row[0] for row in rows]
|
|
289
359
|
|
|
@@ -308,26 +378,67 @@ def make_job_command_with_user_switching(username: str,
|
|
|
308
378
|
return ['sudo', '-H', 'su', '--login', username, '-c', command]
|
|
309
379
|
|
|
310
380
|
|
|
311
|
-
|
|
312
|
-
|
|
381
|
+
@init_db
|
|
382
|
+
def add_job(job_name: str,
|
|
383
|
+
username: str,
|
|
384
|
+
run_timestamp: str,
|
|
385
|
+
resources_str: str,
|
|
386
|
+
metadata: str = '{}') -> Tuple[int, str]:
|
|
313
387
|
"""Atomically reserve the next available job id for the user."""
|
|
388
|
+
assert _DB is not None
|
|
314
389
|
job_submitted_at = time.time()
|
|
315
390
|
# job_id will autoincrement with the null value
|
|
316
|
-
|
|
317
|
-
'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0)',
|
|
391
|
+
_DB.cursor.execute(
|
|
392
|
+
'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
|
|
318
393
|
(job_name, username, job_submitted_at, JobStatus.INIT.value,
|
|
319
|
-
run_timestamp, None, resources_str))
|
|
320
|
-
|
|
321
|
-
rows =
|
|
322
|
-
|
|
394
|
+
run_timestamp, None, resources_str, metadata))
|
|
395
|
+
_DB.conn.commit()
|
|
396
|
+
rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
|
|
397
|
+
(run_timestamp,))
|
|
323
398
|
for row in rows:
|
|
324
399
|
job_id = row[0]
|
|
325
400
|
assert job_id is not None
|
|
326
|
-
|
|
401
|
+
log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, f'{job_id}-{job_name}')
|
|
402
|
+
set_log_dir_no_lock(job_id, log_dir)
|
|
403
|
+
return job_id, log_dir
|
|
327
404
|
|
|
328
405
|
|
|
406
|
+
@init_db
|
|
407
|
+
def set_log_dir_no_lock(job_id: int, log_dir: str) -> None:
|
|
408
|
+
"""Set the log directory for the job.
|
|
409
|
+
|
|
410
|
+
We persist the log directory for the job to allow changing the log directory
|
|
411
|
+
generation logic over versions.
|
|
412
|
+
|
|
413
|
+
Args:
|
|
414
|
+
job_id: The ID of the job.
|
|
415
|
+
log_dir: The log directory for the job.
|
|
416
|
+
"""
|
|
417
|
+
assert _DB is not None
|
|
418
|
+
_DB.cursor.execute('UPDATE jobs SET log_dir=(?) WHERE job_id=(?)',
|
|
419
|
+
(log_dir, job_id))
|
|
420
|
+
_DB.conn.commit()
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
@init_db
|
|
424
|
+
def get_log_dir_for_job(job_id: int) -> Optional[str]:
|
|
425
|
+
"""Get the log directory for the job.
|
|
426
|
+
|
|
427
|
+
Args:
|
|
428
|
+
job_id: The ID of the job.
|
|
429
|
+
"""
|
|
430
|
+
assert _DB is not None
|
|
431
|
+
rows = _DB.cursor.execute('SELECT log_dir FROM jobs WHERE job_id=(?)',
|
|
432
|
+
(job_id,))
|
|
433
|
+
for row in rows:
|
|
434
|
+
return row[0]
|
|
435
|
+
return None
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
@init_db
|
|
329
439
|
def _set_status_no_lock(job_id: int, status: JobStatus) -> None:
|
|
330
440
|
"""Setting the status of the job in the database."""
|
|
441
|
+
assert _DB is not None
|
|
331
442
|
assert status != JobStatus.RUNNING, (
|
|
332
443
|
'Please use set_job_started() to set job status to RUNNING')
|
|
333
444
|
if status.is_terminal():
|
|
@@ -339,15 +450,15 @@ def _set_status_no_lock(job_id: int, status: JobStatus) -> None:
|
|
|
339
450
|
check_end_at_str = ' AND end_at IS NULL'
|
|
340
451
|
if status != JobStatus.FAILED_SETUP:
|
|
341
452
|
check_end_at_str = ''
|
|
342
|
-
|
|
453
|
+
_DB.cursor.execute(
|
|
343
454
|
'UPDATE jobs SET status=(?), end_at=(?) '
|
|
344
455
|
f'WHERE job_id=(?) {check_end_at_str}',
|
|
345
456
|
(status.value, end_at, job_id))
|
|
346
457
|
else:
|
|
347
|
-
|
|
458
|
+
_DB.cursor.execute(
|
|
348
459
|
'UPDATE jobs SET status=(?), end_at=NULL '
|
|
349
460
|
'WHERE job_id=(?)', (status.value, job_id))
|
|
350
|
-
|
|
461
|
+
_DB.conn.commit()
|
|
351
462
|
|
|
352
463
|
|
|
353
464
|
def set_status(job_id: int, status: JobStatus) -> None:
|
|
@@ -357,16 +468,19 @@ def set_status(job_id: int, status: JobStatus) -> None:
|
|
|
357
468
|
_set_status_no_lock(job_id, status)
|
|
358
469
|
|
|
359
470
|
|
|
471
|
+
@init_db
|
|
360
472
|
def set_job_started(job_id: int) -> None:
|
|
361
473
|
# TODO(mraheja): remove pylint disabling when filelock version updated.
|
|
362
474
|
# pylint: disable=abstract-class-instantiated
|
|
475
|
+
assert _DB is not None
|
|
363
476
|
with filelock.FileLock(_get_lock_path(job_id)):
|
|
364
|
-
|
|
477
|
+
_DB.cursor.execute(
|
|
365
478
|
'UPDATE jobs SET status=(?), start_at=(?), end_at=NULL '
|
|
366
479
|
'WHERE job_id=(?)', (JobStatus.RUNNING.value, time.time(), job_id))
|
|
367
|
-
|
|
480
|
+
_DB.conn.commit()
|
|
368
481
|
|
|
369
482
|
|
|
483
|
+
@init_db
|
|
370
484
|
def get_status_no_lock(job_id: int) -> Optional[JobStatus]:
|
|
371
485
|
"""Get the status of the job with the given id.
|
|
372
486
|
|
|
@@ -375,8 +489,9 @@ def get_status_no_lock(job_id: int) -> Optional[JobStatus]:
|
|
|
375
489
|
the status in a while loop as in `log_lib._follow_job_logs`. Otherwise, use
|
|
376
490
|
`get_status`.
|
|
377
491
|
"""
|
|
378
|
-
|
|
379
|
-
|
|
492
|
+
assert _DB is not None
|
|
493
|
+
rows = _DB.cursor.execute('SELECT status FROM jobs WHERE job_id=(?)',
|
|
494
|
+
(job_id,))
|
|
380
495
|
for (status,) in rows:
|
|
381
496
|
if status is None:
|
|
382
497
|
return None
|
|
@@ -391,17 +506,65 @@ def get_status(job_id: int) -> Optional[JobStatus]:
|
|
|
391
506
|
return get_status_no_lock(job_id)
|
|
392
507
|
|
|
393
508
|
|
|
509
|
+
@init_db
|
|
394
510
|
def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
|
|
511
|
+
return message_utils.encode_payload(get_statuses(job_ids))
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
@init_db
|
|
515
|
+
def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
|
|
516
|
+
assert _DB is not None
|
|
395
517
|
# Per-job lock is not required here, since the staled job status will not
|
|
396
518
|
# affect the caller.
|
|
397
519
|
query_str = ','.join(['?'] * len(job_ids))
|
|
398
|
-
rows =
|
|
520
|
+
rows = _DB.cursor.execute(
|
|
399
521
|
f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
|
|
400
522
|
job_ids)
|
|
401
|
-
statuses = {job_id: None for job_id in job_ids}
|
|
523
|
+
statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
|
|
402
524
|
for (job_id, status) in rows:
|
|
403
525
|
statuses[job_id] = status
|
|
404
|
-
return
|
|
526
|
+
return statuses
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
@init_db
|
|
530
|
+
def get_jobs_info(user_hash: Optional[str] = None,
|
|
531
|
+
all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
|
|
532
|
+
"""Get detailed job information.
|
|
533
|
+
|
|
534
|
+
Similar to dump_job_queue but returns structured protobuf objects instead
|
|
535
|
+
of encoded strings.
|
|
536
|
+
|
|
537
|
+
Args:
|
|
538
|
+
user_hash: The user hash to show jobs for. Show all the users if None.
|
|
539
|
+
all_jobs: Whether to show all jobs, not just the pending/running ones.
|
|
540
|
+
"""
|
|
541
|
+
assert _DB is not None
|
|
542
|
+
|
|
543
|
+
status_list: Optional[List[JobStatus]] = [
|
|
544
|
+
JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
|
|
545
|
+
]
|
|
546
|
+
if all_jobs:
|
|
547
|
+
status_list = None
|
|
548
|
+
|
|
549
|
+
jobs = _get_jobs(user_hash, status_list=status_list)
|
|
550
|
+
jobs_info = []
|
|
551
|
+
for job in jobs:
|
|
552
|
+
jobs_info.append(
|
|
553
|
+
jobsv1_pb2.JobInfo(job_id=job['job_id'],
|
|
554
|
+
job_name=job['job_name'],
|
|
555
|
+
username=job['username'],
|
|
556
|
+
submitted_at=job['submitted_at'],
|
|
557
|
+
status=job['status'].to_protobuf(),
|
|
558
|
+
run_timestamp=job['run_timestamp'],
|
|
559
|
+
start_at=job['start_at'],
|
|
560
|
+
end_at=job['end_at'],
|
|
561
|
+
resources=job['resources'],
|
|
562
|
+
pid=job['pid'],
|
|
563
|
+
log_path=os.path.join(
|
|
564
|
+
constants.SKY_LOGS_DIRECTORY,
|
|
565
|
+
job['run_timestamp']),
|
|
566
|
+
metadata=json.dumps(job['metadata'])))
|
|
567
|
+
return jobs_info
|
|
405
568
|
|
|
406
569
|
|
|
407
570
|
def load_statuses_payload(
|
|
@@ -419,14 +582,17 @@ def load_statuses_payload(
|
|
|
419
582
|
return statuses
|
|
420
583
|
|
|
421
584
|
|
|
585
|
+
@init_db
|
|
422
586
|
def get_latest_job_id() -> Optional[int]:
|
|
423
|
-
|
|
587
|
+
assert _DB is not None
|
|
588
|
+
rows = _DB.cursor.execute(
|
|
424
589
|
'SELECT job_id FROM jobs ORDER BY job_id DESC LIMIT 1')
|
|
425
590
|
for (job_id,) in rows:
|
|
426
591
|
return job_id
|
|
427
592
|
return None
|
|
428
593
|
|
|
429
594
|
|
|
595
|
+
@init_db
|
|
430
596
|
def get_job_submitted_or_ended_timestamp_payload(job_id: int,
|
|
431
597
|
get_ended_time: bool) -> str:
|
|
432
598
|
"""Get the job submitted/ended timestamp.
|
|
@@ -437,15 +603,27 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
|
|
|
437
603
|
PENDING state.
|
|
438
604
|
|
|
439
605
|
The normal job duration will use `start_at` instead of `submitted_at` (in
|
|
440
|
-
`format_job_queue()`), because the job may stay in PENDING if
|
|
441
|
-
busy.
|
|
606
|
+
`table_utils.format_job_queue()`), because the job may stay in PENDING if
|
|
607
|
+
the cluster is busy.
|
|
608
|
+
"""
|
|
609
|
+
return message_utils.encode_payload(
|
|
610
|
+
get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
@init_db
|
|
614
|
+
def get_job_submitted_or_ended_timestamp(
|
|
615
|
+
job_id: int, get_ended_time: bool) -> Optional[float]:
|
|
616
|
+
"""Get the job submitted timestamp.
|
|
617
|
+
|
|
618
|
+
Returns the raw timestamp or None if job doesn't exist.
|
|
442
619
|
"""
|
|
620
|
+
assert _DB is not None
|
|
443
621
|
field = 'end_at' if get_ended_time else 'submitted_at'
|
|
444
|
-
rows =
|
|
445
|
-
|
|
622
|
+
rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
|
|
623
|
+
(job_id,))
|
|
446
624
|
for (timestamp,) in rows:
|
|
447
|
-
return
|
|
448
|
-
return
|
|
625
|
+
return timestamp
|
|
626
|
+
return None
|
|
449
627
|
|
|
450
628
|
|
|
451
629
|
def get_ray_port():
|
|
@@ -454,7 +632,8 @@ def get_ray_port():
|
|
|
454
632
|
If the port file does not exist, the cluster was launched before #1790,
|
|
455
633
|
return the default port.
|
|
456
634
|
"""
|
|
457
|
-
port_path =
|
|
635
|
+
port_path = runtime_utils.get_runtime_dir_path(
|
|
636
|
+
constants.SKY_REMOTE_RAY_PORT_FILE)
|
|
458
637
|
if not os.path.exists(port_path):
|
|
459
638
|
return 6379
|
|
460
639
|
port = json.load(open(port_path, 'r', encoding='utf-8'))['ray_port']
|
|
@@ -467,7 +646,8 @@ def get_job_submission_port():
|
|
|
467
646
|
If the port file does not exist, the cluster was launched before #1790,
|
|
468
647
|
return the default port.
|
|
469
648
|
"""
|
|
470
|
-
port_path =
|
|
649
|
+
port_path = runtime_utils.get_runtime_dir_path(
|
|
650
|
+
constants.SKY_REMOTE_RAY_PORT_FILE)
|
|
471
651
|
if not os.path.exists(port_path):
|
|
472
652
|
return 8265
|
|
473
653
|
port = json.load(open(port_path, 'r',
|
|
@@ -492,14 +672,17 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
|
|
|
492
672
|
'end_at': row[JobInfoLoc.END_AT.value],
|
|
493
673
|
'resources': row[JobInfoLoc.RESOURCES.value],
|
|
494
674
|
'pid': row[JobInfoLoc.PID.value],
|
|
675
|
+
'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
|
|
495
676
|
})
|
|
496
677
|
return records
|
|
497
678
|
|
|
498
679
|
|
|
680
|
+
@init_db
|
|
499
681
|
def _get_jobs(
|
|
500
682
|
user_hash: Optional[str],
|
|
501
683
|
status_list: Optional[List[JobStatus]] = None) -> List[Dict[str, Any]]:
|
|
502
684
|
"""Returns jobs with the given fields, sorted by job_id, descending."""
|
|
685
|
+
assert _DB is not None
|
|
503
686
|
if status_list is None:
|
|
504
687
|
status_list = list(JobStatus)
|
|
505
688
|
status_str_list = [repr(status.value) for status in status_list]
|
|
@@ -509,14 +692,16 @@ def _get_jobs(
|
|
|
509
692
|
# We use the old username field for compatibility.
|
|
510
693
|
filter_str += ' AND username=(?)'
|
|
511
694
|
params.append(user_hash)
|
|
512
|
-
rows =
|
|
695
|
+
rows = _DB.cursor.execute(
|
|
513
696
|
f'SELECT * FROM jobs {filter_str} ORDER BY job_id DESC', params)
|
|
514
697
|
records = _get_records_from_rows(rows)
|
|
515
698
|
return records
|
|
516
699
|
|
|
517
700
|
|
|
701
|
+
@init_db
|
|
518
702
|
def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
|
|
519
|
-
|
|
703
|
+
assert _DB is not None
|
|
704
|
+
rows = _DB.cursor.execute(
|
|
520
705
|
f"""\
|
|
521
706
|
SELECT * FROM jobs
|
|
522
707
|
WHERE job_id IN ({','.join(['?'] * len(job_ids))})
|
|
@@ -527,8 +712,10 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
|
|
|
527
712
|
return records
|
|
528
713
|
|
|
529
714
|
|
|
715
|
+
@init_db
|
|
530
716
|
def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
|
|
531
|
-
|
|
717
|
+
assert _DB is not None
|
|
718
|
+
rows = _DB.cursor.execute(
|
|
532
719
|
'SELECT created_time, submit, run_cmd FROM pending_jobs '
|
|
533
720
|
f'WHERE job_id={job_id!r}')
|
|
534
721
|
for row in rows:
|
|
@@ -578,7 +765,7 @@ def update_job_status(job_ids: List[int],
|
|
|
578
765
|
statuses = []
|
|
579
766
|
for job_id in job_ids:
|
|
580
767
|
# Per-job status lock is required because between the job status
|
|
581
|
-
# query and the job status update, the job status in the
|
|
768
|
+
# query and the job status update, the job status in the database
|
|
582
769
|
# can be modified by the generated ray program.
|
|
583
770
|
with filelock.FileLock(_get_lock_path(job_id)):
|
|
584
771
|
status = None
|
|
@@ -629,12 +816,6 @@ def update_job_status(job_ids: List[int],
|
|
|
629
816
|
'the job state is not in terminal states, setting '
|
|
630
817
|
'it to FAILED_DRIVER')
|
|
631
818
|
status = JobStatus.FAILED_DRIVER
|
|
632
|
-
elif job_pid < 0:
|
|
633
|
-
# TODO(zhwu): Backward compatibility, remove after 0.10.0.
|
|
634
|
-
# We set the job status to PENDING instead of actually
|
|
635
|
-
# checking ray job status and let the status in job table
|
|
636
|
-
# take effect in the later max.
|
|
637
|
-
status = JobStatus.PENDING
|
|
638
819
|
|
|
639
820
|
pending_job = _get_pending_job(job_id)
|
|
640
821
|
if pending_job is not None:
|
|
@@ -698,19 +879,29 @@ def update_job_status(job_ids: List[int],
|
|
|
698
879
|
return statuses
|
|
699
880
|
|
|
700
881
|
|
|
882
|
+
@init_db
|
|
701
883
|
def fail_all_jobs_in_progress() -> None:
|
|
884
|
+
assert _DB is not None
|
|
702
885
|
in_progress_status = [
|
|
703
886
|
status.value for status in JobStatus.nonterminal_statuses()
|
|
704
887
|
]
|
|
705
|
-
|
|
888
|
+
_DB.cursor.execute(
|
|
706
889
|
f"""\
|
|
707
890
|
UPDATE jobs SET status=(?)
|
|
708
891
|
WHERE status IN ({','.join(['?'] * len(in_progress_status))})
|
|
709
892
|
""", (JobStatus.FAILED_DRIVER.value, *in_progress_status))
|
|
710
|
-
|
|
893
|
+
_DB.conn.commit()
|
|
711
894
|
|
|
712
895
|
|
|
713
896
|
def update_status() -> None:
|
|
897
|
+
# This signal file suggests that the controller is recovering from a
|
|
898
|
+
# failure. See sky/jobs/utils.py::update_managed_jobs_statuses for more
|
|
899
|
+
# details. When recovering, we should not update the job status to failed
|
|
900
|
+
# driver as they will be recovered later.
|
|
901
|
+
if os.path.exists(
|
|
902
|
+
os.path.expanduser(
|
|
903
|
+
constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
|
|
904
|
+
return
|
|
714
905
|
# This will be called periodically by the skylet to update the status
|
|
715
906
|
# of the jobs in the database, to avoid stale job status.
|
|
716
907
|
nonterminal_jobs = _get_jobs(user_hash=None,
|
|
@@ -720,12 +911,14 @@ def update_status() -> None:
|
|
|
720
911
|
update_job_status(nonterminal_job_ids)
|
|
721
912
|
|
|
722
913
|
|
|
914
|
+
@init_db
|
|
723
915
|
def is_cluster_idle() -> bool:
|
|
724
916
|
"""Returns if the cluster is idle (no in-flight jobs)."""
|
|
917
|
+
assert _DB is not None
|
|
725
918
|
in_progress_status = [
|
|
726
919
|
status.value for status in JobStatus.nonterminal_statuses()
|
|
727
920
|
]
|
|
728
|
-
rows =
|
|
921
|
+
rows = _DB.cursor.execute(
|
|
729
922
|
f"""\
|
|
730
923
|
SELECT COUNT(*) FROM jobs
|
|
731
924
|
WHERE status IN ({','.join(['?'] * len(in_progress_status))})
|
|
@@ -735,34 +928,6 @@ def is_cluster_idle() -> bool:
|
|
|
735
928
|
assert False, 'Should not reach here'
|
|
736
929
|
|
|
737
930
|
|
|
738
|
-
def format_job_queue(jobs: List[Dict[str, Any]]):
|
|
739
|
-
"""Format the job queue for display.
|
|
740
|
-
|
|
741
|
-
Usage:
|
|
742
|
-
jobs = get_job_queue()
|
|
743
|
-
print(format_job_queue(jobs))
|
|
744
|
-
"""
|
|
745
|
-
job_table = log_utils.create_table([
|
|
746
|
-
'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
|
|
747
|
-
'STATUS', 'LOG'
|
|
748
|
-
])
|
|
749
|
-
for job in jobs:
|
|
750
|
-
job_table.add_row([
|
|
751
|
-
job['job_id'],
|
|
752
|
-
job['job_name'],
|
|
753
|
-
job['username'],
|
|
754
|
-
log_utils.readable_time_duration(job['submitted_at']),
|
|
755
|
-
log_utils.readable_time_duration(job['start_at']),
|
|
756
|
-
log_utils.readable_time_duration(job['start_at'],
|
|
757
|
-
job['end_at'],
|
|
758
|
-
absolute=True),
|
|
759
|
-
job['resources'],
|
|
760
|
-
job['status'].colored_str(),
|
|
761
|
-
job['log_path'],
|
|
762
|
-
])
|
|
763
|
-
return job_table
|
|
764
|
-
|
|
765
|
-
|
|
766
931
|
def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
|
|
767
932
|
"""Get the job queue in encoded json format.
|
|
768
933
|
|
|
@@ -794,31 +959,11 @@ def load_job_queue(payload: str) -> List[Dict[str, Any]]:
|
|
|
794
959
|
for job in jobs:
|
|
795
960
|
job['status'] = JobStatus(job['status'])
|
|
796
961
|
job['user_hash'] = job['username']
|
|
797
|
-
|
|
962
|
+
user = global_user_state.get_user(job['user_hash'])
|
|
963
|
+
job['username'] = user.name if user is not None else None
|
|
798
964
|
return jobs
|
|
799
965
|
|
|
800
966
|
|
|
801
|
-
# TODO(zhwu): Backward compatibility for jobs submitted before #4318, remove
|
|
802
|
-
# after 0.10.0.
|
|
803
|
-
def _create_ray_job_submission_client():
|
|
804
|
-
"""Import the ray job submission client."""
|
|
805
|
-
try:
|
|
806
|
-
import ray # pylint: disable=import-outside-toplevel
|
|
807
|
-
except ImportError:
|
|
808
|
-
logger.error('Failed to import ray')
|
|
809
|
-
raise
|
|
810
|
-
try:
|
|
811
|
-
# pylint: disable=import-outside-toplevel
|
|
812
|
-
from ray import job_submission
|
|
813
|
-
except ImportError:
|
|
814
|
-
logger.error(
|
|
815
|
-
f'Failed to import job_submission with ray=={ray.__version__}')
|
|
816
|
-
raise
|
|
817
|
-
port = get_job_submission_port()
|
|
818
|
-
return job_submission.JobSubmissionClient(
|
|
819
|
-
address=f'http://127.0.0.1:{port}')
|
|
820
|
-
|
|
821
|
-
|
|
822
967
|
def _make_ray_job_id(sky_job_id: int) -> str:
|
|
823
968
|
return f'{sky_job_id}-{getpass.getuser()}'
|
|
824
969
|
|
|
@@ -838,6 +983,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
|
|
|
838
983
|
Encoded job IDs that are actually cancelled. Caller should use
|
|
839
984
|
message_utils.decode_payload() to parse.
|
|
840
985
|
"""
|
|
986
|
+
return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
|
|
987
|
+
user_hash))
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
def cancel_jobs(jobs: Optional[List[int]],
|
|
991
|
+
cancel_all: bool = False,
|
|
992
|
+
user_hash: Optional[str] = None) -> List[int]:
|
|
841
993
|
job_records = []
|
|
842
994
|
all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
|
|
843
995
|
if jobs is None and not cancel_all:
|
|
@@ -880,18 +1032,6 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
|
|
|
880
1032
|
# We don't have to start a daemon to forcefully kill the process
|
|
881
1033
|
# as our job driver process will clean up the underlying
|
|
882
1034
|
# child processes.
|
|
883
|
-
elif job['pid'] < 0:
|
|
884
|
-
try:
|
|
885
|
-
# TODO(zhwu): Backward compatibility, remove after 0.10.0.
|
|
886
|
-
# The job was submitted with ray job submit before #4318.
|
|
887
|
-
job_client = _create_ray_job_submission_client()
|
|
888
|
-
job_client.stop_job(_make_ray_job_id(job['job_id']))
|
|
889
|
-
except RuntimeError as e:
|
|
890
|
-
# If the request to the job server fails, we should not
|
|
891
|
-
# set the job to CANCELLED.
|
|
892
|
-
if 'does not exist' not in str(e):
|
|
893
|
-
logger.warning(str(e))
|
|
894
|
-
continue
|
|
895
1035
|
# Get the job status again to avoid race condition.
|
|
896
1036
|
job_status = get_status_no_lock(job['job_id'])
|
|
897
1037
|
if job_status in [
|
|
@@ -901,36 +1041,55 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
|
|
|
901
1041
|
cancelled_ids.append(job['job_id'])
|
|
902
1042
|
|
|
903
1043
|
scheduler.schedule_step()
|
|
904
|
-
return
|
|
1044
|
+
return cancelled_ids
|
|
905
1045
|
|
|
906
1046
|
|
|
1047
|
+
@init_db
|
|
907
1048
|
def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
|
|
908
1049
|
"""Returns the relative path to the log file for a job."""
|
|
909
|
-
|
|
1050
|
+
assert _DB is not None
|
|
1051
|
+
_DB.cursor.execute(
|
|
910
1052
|
"""\
|
|
911
1053
|
SELECT * FROM jobs
|
|
912
1054
|
WHERE job_id=(?)""", (job_id,))
|
|
913
|
-
row =
|
|
1055
|
+
row = _DB.cursor.fetchone()
|
|
914
1056
|
if row is None:
|
|
915
1057
|
return None
|
|
916
1058
|
run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
|
|
917
1059
|
return run_timestamp
|
|
918
1060
|
|
|
919
1061
|
|
|
920
|
-
|
|
921
|
-
|
|
1062
|
+
@init_db
|
|
1063
|
+
def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
|
|
1064
|
+
"""Returns the relative paths to the log files for jobs with globbing,
|
|
1065
|
+
encoded."""
|
|
1066
|
+
job_to_dir = get_job_log_dirs(job_ids)
|
|
1067
|
+
job_to_dir_str: Dict[str, str] = {}
|
|
1068
|
+
for job_id, log_dir in job_to_dir.items():
|
|
1069
|
+
job_to_dir_str[str(job_id)] = log_dir
|
|
1070
|
+
return message_utils.encode_payload(job_to_dir_str)
|
|
1071
|
+
|
|
1072
|
+
|
|
1073
|
+
@init_db
|
|
1074
|
+
def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
|
|
1075
|
+
"""Returns the relative paths to the log files for jobs with globbing."""
|
|
1076
|
+
assert _DB is not None
|
|
922
1077
|
query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
|
|
923
|
-
|
|
1078
|
+
_DB.cursor.execute(
|
|
924
1079
|
f"""\
|
|
925
1080
|
SELECT * FROM jobs
|
|
926
1081
|
WHERE {query_str}""", job_ids)
|
|
927
|
-
rows =
|
|
928
|
-
|
|
1082
|
+
rows = _DB.cursor.fetchall()
|
|
1083
|
+
job_to_dir: Dict[int, str] = {}
|
|
929
1084
|
for row in rows:
|
|
930
1085
|
job_id = row[JobInfoLoc.JOB_ID.value]
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
1086
|
+
if row[JobInfoLoc.LOG_PATH.value]:
|
|
1087
|
+
job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
|
|
1088
|
+
else:
|
|
1089
|
+
run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
|
|
1090
|
+
job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
|
|
1091
|
+
run_timestamp)
|
|
1092
|
+
return job_to_dir
|
|
934
1093
|
|
|
935
1094
|
|
|
936
1095
|
class JobLibCodeGen:
|
|
@@ -951,7 +1110,7 @@ class JobLibCodeGen:
|
|
|
951
1110
|
|
|
952
1111
|
@classmethod
|
|
953
1112
|
def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
|
|
954
|
-
resources_str: str) -> str:
|
|
1113
|
+
resources_str: str, metadata: str) -> str:
|
|
955
1114
|
if job_name is None:
|
|
956
1115
|
job_name = '-'
|
|
957
1116
|
code = [
|
|
@@ -962,12 +1121,25 @@ class JobLibCodeGen:
|
|
|
962
1121
|
'\nif int(constants.SKYLET_VERSION) < 9: '
|
|
963
1122
|
'raise RuntimeError("SkyPilot runtime is too old, which does not '
|
|
964
1123
|
'support submitting jobs.")',
|
|
965
|
-
'\
|
|
1124
|
+
'\nresult = None',
|
|
1125
|
+
'\nif int(constants.SKYLET_VERSION) < 15: '
|
|
1126
|
+
'\n result = job_lib.add_job('
|
|
966
1127
|
f'{job_name!r},'
|
|
967
1128
|
f'{username!r},'
|
|
968
1129
|
f'{run_timestamp!r},'
|
|
969
1130
|
f'{resources_str!r})',
|
|
970
|
-
'
|
|
1131
|
+
'\nelse: '
|
|
1132
|
+
'\n result = job_lib.add_job('
|
|
1133
|
+
f'{job_name!r},'
|
|
1134
|
+
f'{username!r},'
|
|
1135
|
+
f'{run_timestamp!r},'
|
|
1136
|
+
f'{resources_str!r},'
|
|
1137
|
+
f'metadata={metadata!r})',
|
|
1138
|
+
('\nif isinstance(result, tuple):'
|
|
1139
|
+
'\n print("Job ID: " + str(result[0]), flush=True)'
|
|
1140
|
+
'\n print("Log Dir: " + str(result[1]), flush=True)'
|
|
1141
|
+
'\nelse:'
|
|
1142
|
+
'\n print("Job ID: " + str(result), flush=True)'),
|
|
971
1143
|
]
|
|
972
1144
|
return cls._build(code)
|
|
973
1145
|
|
|
@@ -1036,17 +1208,24 @@ class JobLibCodeGen:
|
|
|
1036
1208
|
# We use != instead of is not because 1 is not None will print a warning:
|
|
1037
1209
|
# <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
|
|
1038
1210
|
f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1211
|
+
# For backward compatibility, use the legacy generation rule for
|
|
1212
|
+
# jobs submitted before 0.11.0.
|
|
1213
|
+
('log_dir = None\n'
|
|
1214
|
+
'if hasattr(job_lib, "get_log_dir_for_job"):\n'
|
|
1215
|
+
' log_dir = job_lib.get_log_dir_for_job(job_id)\n'
|
|
1216
|
+
'if log_dir is None:\n'
|
|
1217
|
+
' run_timestamp = job_lib.get_run_timestamp(job_id)\n'
|
|
1218
|
+
f' log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
|
|
1219
|
+
),
|
|
1220
|
+
# Add a newline to leave the if indent block above.
|
|
1221
|
+
f'\nlog_lib.tail_logs(job_id=job_id, log_dir=log_dir, managed_job_id={managed_job_id!r}, follow={follow}, tail={tail})',
|
|
1044
1222
|
# After tailing, check the job status and exit with appropriate code
|
|
1045
1223
|
'job_status = job_lib.get_status(job_id)',
|
|
1046
|
-
|
|
1047
|
-
#
|
|
1048
|
-
#
|
|
1049
|
-
|
|
1224
|
+
'exit_code = exceptions.JobExitCode.from_job_status(job_status)',
|
|
1225
|
+
# Fix for dashboard: When follow=False and job is still running (NOT_FINISHED=101),
|
|
1226
|
+
# exit with success (0) since fetching current logs is a successful operation.
|
|
1227
|
+
# This prevents shell wrappers from printing "command terminated with exit code 101".
|
|
1228
|
+
f'exit_code = 0 if not {follow} and exit_code == 101 else exit_code',
|
|
1050
1229
|
'sys.exit(exit_code)',
|
|
1051
1230
|
]
|
|
1052
1231
|
return cls._build(code)
|
|
@@ -1078,12 +1257,14 @@ class JobLibCodeGen:
|
|
|
1078
1257
|
return cls._build(code)
|
|
1079
1258
|
|
|
1080
1259
|
@classmethod
|
|
1081
|
-
def
|
|
1082
|
-
job_ids: Optional[List[str]]) -> str:
|
|
1260
|
+
def get_log_dirs_for_jobs(cls, job_ids: Optional[List[str]]) -> str:
|
|
1083
1261
|
code = [
|
|
1084
1262
|
f'job_ids = {job_ids} if {job_ids} is not None '
|
|
1085
1263
|
'else [job_lib.get_latest_job_id()]',
|
|
1086
|
-
|
|
1264
|
+
# TODO(aylei): backward compatibility, remove after 0.12.0.
|
|
1265
|
+
'log_dirs = job_lib.get_log_dir_for_jobs(job_ids) if '
|
|
1266
|
+
'hasattr(job_lib, "get_log_dir_for_jobs") else '
|
|
1267
|
+
'job_lib.run_timestamp_with_globbing_payload(job_ids)',
|
|
1087
1268
|
'print(log_dirs, flush=True)',
|
|
1088
1269
|
]
|
|
1089
1270
|
return cls._build(code)
|