skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff compares the contents of two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/skylet/job_lib.py
CHANGED
@@ -23,20 +23,22 @@ from sky import global_user_state
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
+from sky.skylet import runtime_utils
 from sky.utils import common_utils
-from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import subprocess_utils
 from sky.utils.db import db_utils

 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')

 logger = sky_logging.init_logger(__name__)

-_LINUX_NEW_LINE = '\n'
 _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
 # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
 # with pid is the same driver process to guard against the case where
@@ -82,13 +84,9 @@ def create_table(cursor, conn):
     # is not critical and is likely to be enabled by other processes.

     # Pid column is used for keeping track of the driver process of a job. It
-    # can be in
-    # -1: The job was submitted with SkyPilot older than #4318, where we use
-    # ray job submit to submit the job, i.e. no pid is recorded. This is for
-    # backward compatibility and should be removed after 0.10.0.
+    # can be in two states:
     # 0: The job driver process has never been started. When adding a job with
-    # INIT state, the pid will be set to 0
-    # backward compatibility).
+    # INIT state, the pid will be set to 0.
     # >=0: The job has been started. The pid is the driver process's pid.
     # The driver can be actually running or finished.
     # TODO(SKY-1213): username is actually user hash, should rename.
@@ -144,7 +142,7 @@ def init_db(func):

         with _db_init_lock:
             if _DB is None:
-                db_path =
+                db_path = runtime_utils.get_runtime_dir_path('.sky/jobs.db')
                 os.makedirs(pathlib.Path(db_path).parents[0], exist_ok=True)
                 _DB = db_utils.SQLiteConn(db_path, create_table)
         return func(*args, **kwargs)
@@ -220,6 +218,45 @@ class JobStatus(enum.Enum):
         color = _JOB_STATUS_TO_COLOR[self]
         return f'{color}{self.value}{colorama.Style.RESET_ALL}'

+    @classmethod
+    def from_protobuf(
+            cls,
+            protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
+        """Convert protobuf JobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
+            jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
+            jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
+            jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
+            jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
+            jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
+            jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
+            jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
+            jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
+        }
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf JobStatus value: {protobuf_value}')
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
+            JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
+            JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
+            JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
+            JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
+            JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
+            JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
+            JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
+            JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
+        }
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown JobStatus value: {self}')
+        return enum_to_protobuf[self]
+

 # We have two steps for job submissions:
 # 1. Client reserve a job id from the job table by adding a INIT state job.
@@ -261,11 +298,7 @@ class JobScheduler:
                             f'WHERE job_id={job_id!r}'))
         _DB.conn.commit()
         pid = subprocess_utils.launch_new_process_tree(run_cmd)
-
-        # This is for the case where the job is submitted with SkyPilot older
-        # than #4318, using ray job submit.
-        if 'job submit' in run_cmd:
-            pid = -1
+
         _DB.cursor.execute((f'UPDATE jobs SET pid={pid} '
                             f'WHERE job_id={job_id!r}'))
         _DB.conn.commit()
@@ -475,6 +508,11 @@ def get_status(job_id: int) -> Optional[JobStatus]:

 @init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
+    return message_utils.encode_payload(get_statuses(job_ids))
+
+
+@init_db
+def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
     assert _DB is not None
     # Per-job lock is not required here, since the staled job status will not
     # affect the caller.
@@ -482,10 +520,51 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
     rows = _DB.cursor.execute(
         f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
         job_ids)
-    statuses = {job_id: None for job_id in job_ids}
+    statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
     for (job_id, status) in rows:
         statuses[job_id] = status
-    return
+    return statuses
+
+
+@init_db
+def get_jobs_info(user_hash: Optional[str] = None,
+                  all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
+    """Get detailed job information.
+
+    Similar to dump_job_queue but returns structured protobuf objects instead
+    of encoded strings.
+
+    Args:
+        user_hash: The user hash to show jobs for. Show all the users if None.
+        all_jobs: Whether to show all jobs, not just the pending/running ones.
+    """
+    assert _DB is not None
+
+    status_list: Optional[List[JobStatus]] = [
+        JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
+    ]
+    if all_jobs:
+        status_list = None
+
+    jobs = _get_jobs(user_hash, status_list=status_list)
+    jobs_info = []
+    for job in jobs:
+        jobs_info.append(
+            jobsv1_pb2.JobInfo(job_id=job['job_id'],
+                               job_name=job['job_name'],
+                               username=job['username'],
+                               submitted_at=job['submitted_at'],
+                               status=job['status'].to_protobuf(),
+                               run_timestamp=job['run_timestamp'],
+                               start_at=job['start_at'],
+                               end_at=job['end_at'],
+                               resources=job['resources'],
+                               pid=job['pid'],
+                               log_path=os.path.join(
+                                   constants.SKY_LOGS_DIRECTORY,
+                                   job['run_timestamp']),
+                               metadata=json.dumps(job['metadata'])))
+    return jobs_info


 def load_statuses_payload(
@@ -524,16 +603,27 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     PENDING state.

     The normal job duration will use `start_at` instead of `submitted_at` (in
-    `format_job_queue()`), because the job may stay in PENDING if
-    busy.
+    `table_utils.format_job_queue()`), because the job may stay in PENDING if
+    the cluster is busy.
+    """
+    return message_utils.encode_payload(
+        get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
+
+
+@init_db
+def get_job_submitted_or_ended_timestamp(
+        job_id: int, get_ended_time: bool) -> Optional[float]:
+    """Get the job submitted timestamp.
+
+    Returns the raw timestamp or None if job doesn't exist.
     """
     assert _DB is not None
     field = 'end_at' if get_ended_time else 'submitted_at'
     rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                               (job_id,))
     for (timestamp,) in rows:
-        return
-    return
+        return timestamp
+    return None


 def get_ray_port():
@@ -542,7 +632,8 @@ def get_ray_port():
     If the port file does not exist, the cluster was launched before #1790,
     return the default port.
     """
-    port_path =
+    port_path = runtime_utils.get_runtime_dir_path(
+        constants.SKY_REMOTE_RAY_PORT_FILE)
     if not os.path.exists(port_path):
         return 6379
     port = json.load(open(port_path, 'r', encoding='utf-8'))['ray_port']
@@ -555,7 +646,8 @@ def get_job_submission_port():
     If the port file does not exist, the cluster was launched before #1790,
     return the default port.
     """
-    port_path =
+    port_path = runtime_utils.get_runtime_dir_path(
+        constants.SKY_REMOTE_RAY_PORT_FILE)
     if not os.path.exists(port_path):
         return 8265
     port = json.load(open(port_path, 'r',
@@ -673,7 +765,7 @@ def update_job_status(job_ids: List[int],
     statuses = []
     for job_id in job_ids:
         # Per-job status lock is required because between the job status
-        # query and the job status update, the job status in the
+        # query and the job status update, the job status in the database
         # can be modified by the generated ray program.
         with filelock.FileLock(_get_lock_path(job_id)):
             status = None
@@ -724,12 +816,6 @@ def update_job_status(job_ids: List[int],
                         'the job state is not in terminal states, setting '
                         'it to FAILED_DRIVER')
                     status = JobStatus.FAILED_DRIVER
-                elif job_pid < 0:
-                    # TODO(zhwu): Backward compatibility, remove after 0.10.0.
-                    # We set the job status to PENDING instead of actually
-                    # checking ray job status and let the status in job table
-                    # take effect in the later max.
-                    status = JobStatus.PENDING

             pending_job = _get_pending_job(job_id)
             if pending_job is not None:
@@ -842,35 +928,6 @@ def is_cluster_idle() -> bool:
     assert False, 'Should not reach here'


-def format_job_queue(jobs: List[Dict[str, Any]]):
-    """Format the job queue for display.
-
-    Usage:
-        jobs = get_job_queue()
-        print(format_job_queue(jobs))
-    """
-    job_table = log_utils.create_table([
-        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG', 'GIT COMMIT'
-    ])
-    for job in jobs:
-        job_table.add_row([
-            job['job_id'],
-            job['job_name'],
-            job['username'],
-            log_utils.readable_time_duration(job['submitted_at']),
-            log_utils.readable_time_duration(job['start_at']),
-            log_utils.readable_time_duration(job['start_at'],
-                                             job['end_at'],
-                                             absolute=True),
-            job['resources'],
-            job['status'].colored_str(),
-            job['log_path'],
-            job.get('metadata', {}).get('git_commit', '-'),
-        ])
-    return job_table
-
-
 def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
     """Get the job queue in encoded json format.

@@ -907,27 +964,6 @@ def load_job_queue(payload: str) -> List[Dict[str, Any]]:
     return jobs


-# TODO(zhwu): Backward compatibility for jobs submitted before #4318, remove
-# after 0.10.0.
-def _create_ray_job_submission_client():
-    """Import the ray job submission client."""
-    try:
-        import ray  # pylint: disable=import-outside-toplevel
-    except ImportError:
-        logger.error('Failed to import ray')
-        raise
-    try:
-        # pylint: disable=import-outside-toplevel
-        from ray import job_submission
-    except ImportError:
-        logger.error(
-            f'Failed to import job_submission with ray=={ray.__version__}')
-        raise
-    port = get_job_submission_port()
-    return job_submission.JobSubmissionClient(
-        address=f'http://127.0.0.1:{port}')
-
-
 def _make_ray_job_id(sky_job_id: int) -> str:
     return f'{sky_job_id}-{getpass.getuser()}'

@@ -947,6 +983,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
         Encoded job IDs that are actually cancelled. Caller should use
         message_utils.decode_payload() to parse.
     """
+    return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
+                                                    user_hash))
+
+
+def cancel_jobs(jobs: Optional[List[int]],
+                cancel_all: bool = False,
+                user_hash: Optional[str] = None) -> List[int]:
     job_records = []
     all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
     if jobs is None and not cancel_all:
@@ -989,18 +1032,6 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
             # We don't have to start a daemon to forcefully kill the process
             # as our job driver process will clean up the underlying
             # child processes.
-            elif job['pid'] < 0:
-                try:
-                    # TODO(zhwu): Backward compatibility, remove after 0.10.0.
-                    # The job was submitted with ray job submit before #4318.
-                    job_client = _create_ray_job_submission_client()
-                    job_client.stop_job(_make_ray_job_id(job['job_id']))
-                except RuntimeError as e:
-                    # If the request to the job server fails, we should not
-                    # set the job to CANCELLED.
-                    if 'does not exist' not in str(e):
-                        logger.warning(str(e))
-                        continue
             # Get the job status again to avoid race condition.
             job_status = get_status_no_lock(job['job_id'])
             if job_status in [
@@ -1010,7 +1041,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
             cancelled_ids.append(job['job_id'])

     scheduler.schedule_step()
-    return
+    return cancelled_ids


 @init_db
@@ -1030,6 +1061,17 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:

 @init_db
 def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
+    """Returns the relative paths to the log files for jobs with globbing,
+    encoded."""
+    job_to_dir = get_job_log_dirs(job_ids)
+    job_to_dir_str: Dict[str, str] = {}
+    for job_id, log_dir in job_to_dir.items():
+        job_to_dir_str[str(job_id)] = log_dir
+    return message_utils.encode_payload(job_to_dir_str)
+
+
+@init_db
+def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
     """Returns the relative paths to the log files for jobs with globbing."""
     assert _DB is not None
     query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
@@ -1038,16 +1080,16 @@ def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
         SELECT * FROM jobs
         WHERE {query_str}""", job_ids)
     rows = _DB.cursor.fetchall()
-    job_to_dir = {}
+    job_to_dir: Dict[int, str] = {}
     for row in rows:
         job_id = row[JobInfoLoc.JOB_ID.value]
         if row[JobInfoLoc.LOG_PATH.value]:
-            job_to_dir[
+            job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
         else:
             run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
-            job_to_dir[
-
-    return
+            job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                              run_timestamp)
+    return job_to_dir


 class JobLibCodeGen:
@@ -1176,15 +1218,10 @@ class JobLibCodeGen:
                 f' log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
             ),
             # Add a newline to leave the if indent block above.
-            f'\
-            f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
-            f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
+            f'\nlog_lib.tail_logs(job_id=job_id, log_dir=log_dir, managed_job_id={managed_job_id!r}, follow={follow}, tail={tail})',
             # After tailing, check the job status and exit with appropriate code
             'job_status = job_lib.get_status(job_id)',
-
-            # and older did not have JobExitCode, so we use 0 for those versions
-            # TODO: Remove this special handling after 0.10.0.
-            'exit_code = exceptions.JobExitCode.from_job_status(job_status) if getattr(constants, "SKYLET_LIB_VERSION", 1) > 2 else 0',
+            'exit_code = exceptions.JobExitCode.from_job_status(job_status)',
             # Fix for dashboard: When follow=False and job is still running (NOT_FINISHED=101),
             # exit with success (0) since fetching current logs is a successful operation.
             # This prevents shell wrappers from printing "command terminated with exit code 101".
@@ -1236,4 +1273,5 @@ class JobLibCodeGen:
     def _build(cls, code: List[str]) -> str:
         code = cls._PREFIX + code
         code = ';'.join(code)
-        return f'{constants.
+        return (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
+                f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}')