skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/skylet/job_lib.py
CHANGED
@@ -3,6 +3,7 @@
 This is a remote utility module that provides job queue functionality.
 """
 import enum
+import functools
 import getpass
 import json
 import os
@@ -10,9 +11,10 @@ import pathlib
 import shlex
 import signal
 import sqlite3
+import threading
 import time
 import typing
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Optional, Sequence, Tuple

 import colorama
 import filelock
@@ -22,15 +24,17 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
 from sky.utils import common_utils
-from sky.utils import db_utils
-from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import subprocess_utils
+from sky.utils.db import db_utils

 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')

 logger = sky_logging.init_logger(__name__)

@@ -60,10 +64,8 @@ class JobInfoLoc(enum.IntEnum):
     END_AT = 7
     RESOURCES = 8
     PID = 9
-
-
-_DB_PATH = os.path.expanduser('~/.sky/jobs.db')
-os.makedirs(pathlib.Path(_DB_PATH).parents[0], exist_ok=True)
+    LOG_PATH = 10
+    METADATA = 11


 def create_table(cursor, conn):
@@ -103,7 +105,9 @@ def create_table(cursor, conn):
         start_at FLOAT DEFAULT -1,
         end_at FLOAT DEFAULT NULL,
         resources TEXT DEFAULT NULL,
-        pid INTEGER DEFAULT -1
+        pid INTEGER DEFAULT -1,
+        log_dir TEXT DEFAULT NULL,
+        metadata TEXT DEFAULT '{}')""")

     cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
         job_id INTEGER,
@@ -116,12 +120,38 @@ def create_table(cursor, conn):
     db_utils.add_column_to_table(cursor, conn, 'jobs', 'resources', 'TEXT')
     db_utils.add_column_to_table(cursor, conn, 'jobs', 'pid',
                                  'INTEGER DEFAULT -1')
+    db_utils.add_column_to_table(cursor, conn, 'jobs', 'log_dir',
+                                 'TEXT DEFAULT NULL')
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'jobs',
+                                 'metadata',
+                                 'TEXT DEFAULT \'{}\'',
+                                 value_to_replace_existing_entries='{}')
     conn.commit()


-_DB =
-
-
+_DB = None
+_db_init_lock = threading.Lock()
+
+
+def init_db(func):
+    """Initialize the database."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        global _DB
+        if _DB is not None:
+            return func(*args, **kwargs)
+
+        with _db_init_lock:
+            if _DB is None:
+                db_path = os.path.expanduser('~/.sky/jobs.db')
+                os.makedirs(pathlib.Path(db_path).parents[0], exist_ok=True)
+                _DB = db_utils.SQLiteConn(db_path, create_table)
+        return func(*args, **kwargs)
+
+    return wrapper


 class JobStatus(enum.Enum):
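
Note: the `init_db` decorator above replaces the old import-time `_DB` connection with lazy, lock-guarded initialization, so importing `job_lib` no longer creates `~/.sky/jobs.db` as a side effect. A minimal standalone sketch of the same pattern, using an in-memory SQLite database and a hypothetical `people` table rather than the SkyPilot module itself:

import functools
import sqlite3
import threading

_DB = None
_db_init_lock = threading.Lock()


def _create_table(conn: sqlite3.Connection) -> None:
    conn.execute('CREATE TABLE IF NOT EXISTS people (name TEXT)')


def init_db(func):
    """Open the connection on first use, exactly once across threads."""

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        global _DB
        if _DB is None:
            with _db_init_lock:
                if _DB is None:  # Double-checked locking.
                    _DB = sqlite3.connect(':memory:', check_same_thread=False)
                    _create_table(_DB)
        return func(*args, **kwargs)

    return wrapper


@init_db
def count_people() -> int:
    # The assert mirrors the `assert _DB is not None` lines in the diff: the
    # decorator guarantees initialization, the assert narrows the type.
    assert _DB is not None
    return _DB.execute('SELECT COUNT(*) FROM people').fetchone()[0]


print(count_people())  # 0 -- the database is only created on this first call.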
@@ -192,6 +222,45 @@ class JobStatus(enum.Enum):
         color = _JOB_STATUS_TO_COLOR[self]
         return f'{color}{self.value}{colorama.Style.RESET_ALL}'

+    @classmethod
+    def from_protobuf(
+            cls,
+            protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
+        """Convert protobuf JobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
+            jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
+            jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
+            jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
+            jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
+            jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
+            jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
+            jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
+            jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
+        }
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf JobStatus value: {protobuf_value}')
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
+            JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
+            JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
+            JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
+            JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
+            JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
+            JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
+            JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
+            JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
+        }
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown JobStatus value: {self}')
+        return enum_to_protobuf[self]
+

 # We have two steps for job submissions:
 # 1. Client reserve a job id from the job table by adding a INIT state job.
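
Note: `from_protobuf`/`to_protobuf` keep the Python `JobStatus` enum and the generated `jobsv1_pb2` constants in sync in both directions and reject unknown values. A rough sketch of the mapping shape, with plain integers standing in for the generated protobuf constants (the real ones live in `sky/schemas/generated/jobsv1_pb2.py`, and their numeric values may differ):

import enum
from typing import Optional

# Stand-ins for generated protobuf enum values (assumed, not the real module).
JOB_STATUS_UNSPECIFIED = 0
JOB_STATUS_PENDING = 1
JOB_STATUS_RUNNING = 2


class JobStatus(enum.Enum):
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'

    @classmethod
    def from_protobuf(cls, value: int) -> Optional['JobStatus']:
        mapping = {
            JOB_STATUS_PENDING: cls.PENDING,
            JOB_STATUS_RUNNING: cls.RUNNING,
            JOB_STATUS_UNSPECIFIED: None,  # Unset field means "no status".
        }
        if value not in mapping:
            raise ValueError(f'Unknown protobuf JobStatus value: {value}')
        return mapping[value]

    def to_protobuf(self) -> int:
        return {
            JobStatus.PENDING: JOB_STATUS_PENDING,
            JobStatus.RUNNING: JOB_STATUS_RUNNING,
        }[self]


# Round trip: the wire value and the Python enum stay in sync.
assert JobStatus.from_protobuf(JobStatus.RUNNING.to_protobuf()) is JobStatus.RUNNING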
@@ -210,30 +279,37 @@ _PRE_RESOURCE_STATUSES = [JobStatus.PENDING]
 class JobScheduler:
     """Base class for job scheduler"""

+    @init_db
     def queue(self, job_id: int, cmd: str) -> None:
-
-
-
+        assert _DB is not None
+        _DB.cursor.execute('INSERT INTO pending_jobs VALUES (?,?,?,?)',
+                           (job_id, cmd, 0, int(time.time())))
+        _DB.conn.commit()
         set_status(job_id, JobStatus.PENDING)
         self.schedule_step()

+    @init_db
     def remove_job_no_lock(self, job_id: int) -> None:
-
-
+        assert _DB is not None
+        _DB.cursor.execute(f'DELETE FROM pending_jobs WHERE job_id={job_id!r}')
+        _DB.conn.commit()

+    @init_db
     def _run_job(self, job_id: int, run_cmd: str):
-
-
-
+        assert _DB is not None
+        _DB.cursor.execute(
+            (f'UPDATE pending_jobs SET submit={int(time.time())} '
+             f'WHERE job_id={job_id!r}'))
+        _DB.conn.commit()
         pid = subprocess_utils.launch_new_process_tree(run_cmd)
         # TODO(zhwu): Backward compatibility, remove this check after 0.10.0.
         # This is for the case where the job is submitted with SkyPilot older
         # than #4318, using ray job submit.
         if 'job submit' in run_cmd:
             pid = -1
-
-
-
+        _DB.cursor.execute((f'UPDATE jobs SET pid={pid} '
+                            f'WHERE job_id={job_id!r}'))
+        _DB.conn.commit()

     def schedule_step(self, force_update_jobs: bool = False) -> None:
         if force_update_jobs:
@@ -282,8 +358,10 @@ class JobScheduler:
 class FIFOScheduler(JobScheduler):
     """First in first out job scheduler"""

+    @init_db
     def _get_pending_job_ids(self) -> List[int]:
-
+        assert _DB is not None
+        rows = _DB.cursor.execute(
             'SELECT job_id FROM pending_jobs ORDER BY job_id').fetchall()
         return [row[0] for row in rows]

@@ -308,26 +386,67 @@ def make_job_command_with_user_switching(username: str,
     return ['sudo', '-H', 'su', '--login', username, '-c', command]


-
-
+@init_db
+def add_job(job_name: str,
+            username: str,
+            run_timestamp: str,
+            resources_str: str,
+            metadata: str = '{}') -> Tuple[int, str]:
     """Atomically reserve the next available job id for the user."""
+    assert _DB is not None
     job_submitted_at = time.time()
     # job_id will autoincrement with the null value
-
-        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0)',
+    _DB.cursor.execute(
+        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
         (job_name, username, job_submitted_at, JobStatus.INIT.value,
-         run_timestamp, None, resources_str))
-
-    rows =
-
+         run_timestamp, None, resources_str, metadata))
+    _DB.conn.commit()
+    rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
+                              (run_timestamp,))
     for row in rows:
         job_id = row[0]
     assert job_id is not None
-
+    log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, f'{job_id}-{job_name}')
+    set_log_dir_no_lock(job_id, log_dir)
+    return job_id, log_dir
+
+
+@init_db
+def set_log_dir_no_lock(job_id: int, log_dir: str) -> None:
+    """Set the log directory for the job.
+
+    We persist the log directory for the job to allow changing the log directory
+    generation logic over versions.
+
+    Args:
+        job_id: The ID of the job.
+        log_dir: The log directory for the job.
+    """
+    assert _DB is not None
+    _DB.cursor.execute('UPDATE jobs SET log_dir=(?) WHERE job_id=(?)',
+                       (log_dir, job_id))
+    _DB.conn.commit()
+
+
+@init_db
+def get_log_dir_for_job(job_id: int) -> Optional[str]:
+    """Get the log directory for the job.
+
+    Args:
+        job_id: The ID of the job.
+    """
+    assert _DB is not None
+    rows = _DB.cursor.execute('SELECT log_dir FROM jobs WHERE job_id=(?)',
+                              (job_id,))
+    for row in rows:
+        return row[0]
+    return None


+@init_db
 def _set_status_no_lock(job_id: int, status: JobStatus) -> None:
     """Setting the status of the job in the database."""
+    assert _DB is not None
     assert status != JobStatus.RUNNING, (
         'Please use set_job_started() to set job status to RUNNING')
     if status.is_terminal():
@@ -339,15 +458,15 @@ def _set_status_no_lock(job_id: int, status: JobStatus) -> None:
         check_end_at_str = ' AND end_at IS NULL'
         if status != JobStatus.FAILED_SETUP:
             check_end_at_str = ''
-
+        _DB.cursor.execute(
             'UPDATE jobs SET status=(?), end_at=(?) '
             f'WHERE job_id=(?) {check_end_at_str}',
             (status.value, end_at, job_id))
     else:
-
+        _DB.cursor.execute(
             'UPDATE jobs SET status=(?), end_at=NULL '
             'WHERE job_id=(?)', (status.value, job_id))
-
+    _DB.conn.commit()


 def set_status(job_id: int, status: JobStatus) -> None:
@@ -357,16 +476,19 @@ def set_status(job_id: int, status: JobStatus) -> None:
         _set_status_no_lock(job_id, status)


+@init_db
 def set_job_started(job_id: int) -> None:
     # TODO(mraheja): remove pylint disabling when filelock version updated.
     # pylint: disable=abstract-class-instantiated
+    assert _DB is not None
     with filelock.FileLock(_get_lock_path(job_id)):
-
+        _DB.cursor.execute(
             'UPDATE jobs SET status=(?), start_at=(?), end_at=NULL '
             'WHERE job_id=(?)', (JobStatus.RUNNING.value, time.time(), job_id))
-
+        _DB.conn.commit()


+@init_db
 def get_status_no_lock(job_id: int) -> Optional[JobStatus]:
     """Get the status of the job with the given id.

@@ -375,8 +497,9 @@ def get_status_no_lock(job_id: int) -> Optional[JobStatus]:
     the status in a while loop as in `log_lib._follow_job_logs`. Otherwise, use
     `get_status`.
     """
-
-
+    assert _DB is not None
+    rows = _DB.cursor.execute('SELECT status FROM jobs WHERE job_id=(?)',
+                              (job_id,))
     for (status,) in rows:
         if status is None:
             return None
@@ -391,17 +514,65 @@ def get_status(job_id: int) -> Optional[JobStatus]:
         return get_status_no_lock(job_id)


+@init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
+    return message_utils.encode_payload(get_statuses(job_ids))
+
+
+@init_db
+def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
+    assert _DB is not None
     # Per-job lock is not required here, since the staled job status will not
     # affect the caller.
     query_str = ','.join(['?'] * len(job_ids))
-    rows =
+    rows = _DB.cursor.execute(
         f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
         job_ids)
-    statuses = {job_id: None for job_id in job_ids}
+    statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
     for (job_id, status) in rows:
         statuses[job_id] = status
-    return
+    return statuses
+
+
+@init_db
+def get_jobs_info(user_hash: Optional[str] = None,
+                  all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
+    """Get detailed job information.
+
+    Similar to dump_job_queue but returns structured protobuf objects instead
+    of encoded strings.
+
+    Args:
+        user_hash: The user hash to show jobs for. Show all the users if None.
+        all_jobs: Whether to show all jobs, not just the pending/running ones.
+    """
+    assert _DB is not None
+
+    status_list: Optional[List[JobStatus]] = [
+        JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
+    ]
+    if all_jobs:
+        status_list = None
+
+    jobs = _get_jobs(user_hash, status_list=status_list)
+    jobs_info = []
+    for job in jobs:
+        jobs_info.append(
+            jobsv1_pb2.JobInfo(job_id=job['job_id'],
+                               job_name=job['job_name'],
+                               username=job['username'],
+                               submitted_at=job['submitted_at'],
+                               status=job['status'].to_protobuf(),
+                               run_timestamp=job['run_timestamp'],
+                               start_at=job['start_at'],
+                               end_at=job['end_at'],
+                               resources=job['resources'],
+                               pid=job['pid'],
+                               log_path=os.path.join(
+                                   constants.SKY_LOGS_DIRECTORY,
+                                   job['run_timestamp']),
+                               metadata=json.dumps(job['metadata'])))
+    return jobs_info


 def load_statuses_payload(
@@ -419,14 +590,17 @@ def load_statuses_payload(
     return statuses


+@init_db
 def get_latest_job_id() -> Optional[int]:
-
+    assert _DB is not None
+    rows = _DB.cursor.execute(
         'SELECT job_id FROM jobs ORDER BY job_id DESC LIMIT 1')
     for (job_id,) in rows:
         return job_id
     return None


+@init_db
 def get_job_submitted_or_ended_timestamp_payload(job_id: int,
                                                  get_ended_time: bool) -> str:
     """Get the job submitted/ended timestamp.
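
Note: `get_statuses()` now returns a structured dict while `get_statuses_payload()` only encodes it for transport, so in-process callers (for example the new gRPC services in sky/skylet/services.py) can skip the string round trip. A toy sketch of that layering, with plain JSON standing in for `message_utils.encode_payload` (the real helper adds its own framing):

import json
from typing import Dict, List, Optional

_FAKE_DB = {1: 'RUNNING', 2: 'SUCCEEDED'}  # Hypothetical job-table contents.


def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
    # Structured result for in-process callers.
    return {job_id: _FAKE_DB.get(job_id) for job_id in job_ids}


def get_statuses_payload(job_ids: List[int]) -> str:
    # Thin wrapper: same data, encoded for callers on the other end of SSH.
    return json.dumps(get_statuses(job_ids))


print(get_statuses([1, 3]))          # {1: 'RUNNING', 3: None}
print(get_statuses_payload([1, 3]))  # '{"1": "RUNNING", "3": null}'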
@@ -437,15 +611,27 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     PENDING state.

     The normal job duration will use `start_at` instead of `submitted_at` (in
-    `format_job_queue()`), because the job may stay in PENDING if
-    busy.
+    `table_utils.format_job_queue()`), because the job may stay in PENDING if
+    the cluster is busy.
+    """
+    return message_utils.encode_payload(
+        get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
+
+
+@init_db
+def get_job_submitted_or_ended_timestamp(
+        job_id: int, get_ended_time: bool) -> Optional[float]:
+    """Get the job submitted timestamp.
+
+    Returns the raw timestamp or None if job doesn't exist.
     """
+    assert _DB is not None
     field = 'end_at' if get_ended_time else 'submitted_at'
-    rows =
-
+    rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
+                              (job_id,))
     for (timestamp,) in rows:
-        return
-    return
+        return timestamp
+    return None


 def get_ray_port():
@@ -492,14 +678,17 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
             'end_at': row[JobInfoLoc.END_AT.value],
             'resources': row[JobInfoLoc.RESOURCES.value],
             'pid': row[JobInfoLoc.PID.value],
+            'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
         })
     return records


+@init_db
 def _get_jobs(
         user_hash: Optional[str],
         status_list: Optional[List[JobStatus]] = None) -> List[Dict[str, Any]]:
     """Returns jobs with the given fields, sorted by job_id, descending."""
+    assert _DB is not None
     if status_list is None:
         status_list = list(JobStatus)
     status_str_list = [repr(status.value) for status in status_list]
@@ -509,14 +698,16 @@ def _get_jobs(
         # We use the old username field for compatibility.
         filter_str += ' AND username=(?)'
         params.append(user_hash)
-    rows =
+    rows = _DB.cursor.execute(
         f'SELECT * FROM jobs {filter_str} ORDER BY job_id DESC', params)
     records = _get_records_from_rows(rows)
     return records


+@init_db
 def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
-
+    assert _DB is not None
+    rows = _DB.cursor.execute(
         f"""\
         SELECT * FROM jobs
         WHERE job_id IN ({','.join(['?'] * len(job_ids))})
@@ -527,8 +718,10 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
     return records


+@init_db
 def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
-
+    assert _DB is not None
+    rows = _DB.cursor.execute(
         'SELECT created_time, submit, run_cmd FROM pending_jobs '
         f'WHERE job_id={job_id!r}')
     for row in rows:
@@ -698,19 +891,29 @@ def update_job_status(job_ids: List[int],
     return statuses


+@init_db
 def fail_all_jobs_in_progress() -> None:
+    assert _DB is not None
     in_progress_status = [
         status.value for status in JobStatus.nonterminal_statuses()
     ]
-
+    _DB.cursor.execute(
         f"""\
         UPDATE jobs SET status=(?)
         WHERE status IN ({','.join(['?'] * len(in_progress_status))})
         """, (JobStatus.FAILED_DRIVER.value, *in_progress_status))
-
+    _DB.conn.commit()


 def update_status() -> None:
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/jobs/utils.py::update_managed_jobs_statuses for more
+    # details. When recovering, we should not update the job status to failed
+    # driver as they will be recovered later.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
     # This will be called periodically by the skylet to update the status
     # of the jobs in the database, to avoid stale job status.
     nonterminal_jobs = _get_jobs(user_hash=None,
@@ -720,12 +923,14 @@ def update_status() -> None:
     update_job_status(nonterminal_job_ids)


+@init_db
 def is_cluster_idle() -> bool:
     """Returns if the cluster is idle (no in-flight jobs)."""
+    assert _DB is not None
     in_progress_status = [
         status.value for status in JobStatus.nonterminal_statuses()
     ]
-    rows =
+    rows = _DB.cursor.execute(
         f"""\
         SELECT COUNT(*) FROM jobs
         WHERE status IN ({','.join(['?'] * len(in_progress_status))})
@@ -735,34 +940,6 @@ def is_cluster_idle() -> bool:
     assert False, 'Should not reach here'


-def format_job_queue(jobs: List[Dict[str, Any]]):
-    """Format the job queue for display.
-
-    Usage:
-        jobs = get_job_queue()
-        print(format_job_queue(jobs))
-    """
-    job_table = log_utils.create_table([
-        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG'
-    ])
-    for job in jobs:
-        job_table.add_row([
-            job['job_id'],
-            job['job_name'],
-            job['username'],
-            log_utils.readable_time_duration(job['submitted_at']),
-            log_utils.readable_time_duration(job['start_at']),
-            log_utils.readable_time_duration(job['start_at'],
-                                             job['end_at'],
-                                             absolute=True),
-            job['resources'],
-            job['status'].colored_str(),
-            job['log_path'],
-        ])
-    return job_table
-
-
 def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
     """Get the job queue in encoded json format.

@@ -794,7 +971,8 @@ def load_job_queue(payload: str) -> List[Dict[str, Any]]:
     for job in jobs:
         job['status'] = JobStatus(job['status'])
         job['user_hash'] = job['username']
-
+        user = global_user_state.get_user(job['user_hash'])
+        job['username'] = user.name if user is not None else None
     return jobs

@@ -838,6 +1016,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
         Encoded job IDs that are actually cancelled. Caller should use
         message_utils.decode_payload() to parse.
     """
+    return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
+                                                    user_hash))
+
+
+def cancel_jobs(jobs: Optional[List[int]],
+                cancel_all: bool = False,
+                user_hash: Optional[str] = None) -> List[int]:
     job_records = []
     all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
     if jobs is None and not cancel_all:
@@ -901,36 +1086,55 @@
         cancelled_ids.append(job['job_id'])

     scheduler.schedule_step()
-    return
+    return cancelled_ids


+@init_db
 def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
     """Returns the relative path to the log file for a job."""
-
+    assert _DB is not None
+    _DB.cursor.execute(
         """\
         SELECT * FROM jobs
         WHERE job_id=(?)""", (job_id,))
-    row =
+    row = _DB.cursor.fetchone()
     if row is None:
         return None
     run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
     return run_timestamp


-
-
+@init_db
+def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
+    """Returns the relative paths to the log files for jobs with globbing,
+    encoded."""
+    job_to_dir = get_job_log_dirs(job_ids)
+    job_to_dir_str: Dict[str, str] = {}
+    for job_id, log_dir in job_to_dir.items():
+        job_to_dir_str[str(job_id)] = log_dir
+    return message_utils.encode_payload(job_to_dir_str)
+
+
+@init_db
+def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
+    """Returns the relative paths to the log files for jobs with globbing."""
+    assert _DB is not None
     query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
-
+    _DB.cursor.execute(
         f"""\
         SELECT * FROM jobs
         WHERE {query_str}""", job_ids)
-    rows =
-
+    rows = _DB.cursor.fetchall()
+    job_to_dir: Dict[int, str] = {}
     for row in rows:
         job_id = row[JobInfoLoc.JOB_ID.value]
-
-
-
+        if row[JobInfoLoc.LOG_PATH.value]:
+            job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
+        else:
+            run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
+            job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                              run_timestamp)
+    return job_to_dir


 class JobLibCodeGen:
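
Note: `get_job_log_dirs()` prefers the newly persisted `log_dir` column and only falls back to the legacy `<logs dir>/<run_timestamp>` path for rows written by older runtimes. The fallback rule in isolation (hypothetical values; the real code reads the `JobInfoLoc.LOG_PATH` and `RUN_TIMESTAMP` columns):

import os
from typing import Optional

SKY_LOGS_DIRECTORY = '~/sky_logs'  # Mirrors the constant in sky/skylet/constants.py.


def resolve_log_dir(log_dir: Optional[str], run_timestamp: str) -> str:
    # New rows carry an explicit log_dir; old rows only have run_timestamp.
    if log_dir:
        return log_dir
    return os.path.join(SKY_LOGS_DIRECTORY, run_timestamp)


print(resolve_log_dir('~/sky_logs/7-train', 'sky-2024-01-01-00-00-00-000000'))
print(resolve_log_dir(None, 'sky-2024-01-01-00-00-00-000000'))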
@@ -951,7 +1155,7 @@ class JobLibCodeGen:

     @classmethod
     def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
-                resources_str: str) -> str:
+                resources_str: str, metadata: str) -> str:
         if job_name is None:
             job_name = '-'
         code = [
@@ -962,12 +1166,25 @@
             '\nif int(constants.SKYLET_VERSION) < 9: '
             'raise RuntimeError("SkyPilot runtime is too old, which does not '
             'support submitting jobs.")',
-            '\
+            '\nresult = None',
+            '\nif int(constants.SKYLET_VERSION) < 15: '
+            '\n result = job_lib.add_job('
             f'{job_name!r},'
             f'{username!r},'
             f'{run_timestamp!r},'
             f'{resources_str!r})',
-            '
+            '\nelse: '
+            '\n result = job_lib.add_job('
+            f'{job_name!r},'
+            f'{username!r},'
+            f'{run_timestamp!r},'
+            f'{resources_str!r},'
+            f'metadata={metadata!r})',
+            ('\nif isinstance(result, tuple):'
+             '\n print("Job ID: " + str(result[0]), flush=True)'
+             '\n print("Log Dir: " + str(result[1]), flush=True)'
+             '\nelse:'
+             '\n print("Job ID: " + str(result), flush=True)'),
         ]
         return cls._build(code)

@@ -1036,9 +1253,17 @@
            # We use != instead of is not because 1 is not None will print a warning:
            # <stdin>:1: SyntaxWarning: "is not" with a literal. Did you mean "!="?
            f'job_id = {job_id} if {job_id} != None else job_lib.get_latest_job_id()',
-
-
-
+           # For backward compatibility, use the legacy generation rule for
+           # jobs submitted before 0.11.0.
+           ('log_dir = None\n'
+            'if hasattr(job_lib, "get_log_dir_for_job"):\n'
+            '    log_dir = job_lib.get_log_dir_for_job(job_id)\n'
+            'if log_dir is None:\n'
+            '    run_timestamp = job_lib.get_run_timestamp(job_id)\n'
+            f'    log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
+           ),
+           # Add a newline to leave the if indent block above.
+           f'\ntail_log_kwargs = {{"job_id": job_id, "log_dir": log_dir, "managed_job_id": {managed_job_id!r}, "follow": {follow}}}',
            f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
            f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
            # After tailing, check the job status and exit with appropriate code
@@ -1047,6 +1272,10 @@
            # and older did not have JobExitCode, so we use 0 for those versions
            # TODO: Remove this special handling after 0.10.0.
            'exit_code = exceptions.JobExitCode.from_job_status(job_status) if getattr(constants, "SKYLET_LIB_VERSION", 1) > 2 else 0',
+           # Fix for dashboard: When follow=False and job is still running (NOT_FINISHED=101),
+           # exit with success (0) since fetching current logs is a successful operation.
+           # This prevents shell wrappers from printing "command terminated with exit code 101".
+           f'exit_code = 0 if not {follow} and exit_code == 101 else exit_code',
            'sys.exit(exit_code)',
         ]
         return cls._build(code)
@@ -1078,12 +1307,14 @@
         return cls._build(code)

     @classmethod
-    def
-            job_ids: Optional[List[str]]) -> str:
+    def get_log_dirs_for_jobs(cls, job_ids: Optional[List[str]]) -> str:
         code = [
             f'job_ids = {job_ids} if {job_ids} is not None '
             'else [job_lib.get_latest_job_id()]',
-
+            # TODO(aylei): backward compatibility, remove after 0.12.0.
+            'log_dirs = job_lib.get_log_dir_for_jobs(job_ids) if '
+            'hasattr(job_lib, "get_log_dir_for_jobs") else '
+            'job_lib.run_timestamp_with_globbing_payload(job_ids)',
             'print(log_dirs, flush=True)',
         ]
         return cls._build(code)