skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic by the registry's scanner.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
|
@@ -6,22 +6,40 @@ Concepts:
|
|
|
6
6
|
- Cluster handle: (non-user facing) an opaque backend handle for us to
|
|
7
7
|
interact with a cluster.
|
|
8
8
|
"""
|
|
9
|
+
import asyncio
|
|
10
|
+
import enum
|
|
11
|
+
import functools
|
|
9
12
|
import json
|
|
10
13
|
import os
|
|
11
|
-
import pathlib
|
|
12
14
|
import pickle
|
|
13
|
-
import
|
|
15
|
+
import re
|
|
16
|
+
import threading
|
|
14
17
|
import time
|
|
15
18
|
import typing
|
|
16
19
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
17
20
|
import uuid
|
|
18
21
|
|
|
22
|
+
import sqlalchemy
|
|
23
|
+
from sqlalchemy import exc as sqlalchemy_exc
|
|
24
|
+
from sqlalchemy import orm
|
|
25
|
+
from sqlalchemy.dialects import postgresql
|
|
26
|
+
from sqlalchemy.dialects import sqlite
|
|
27
|
+
from sqlalchemy.ext import asyncio as sql_async
|
|
28
|
+
from sqlalchemy.ext import declarative
|
|
29
|
+
|
|
19
30
|
from sky import models
|
|
20
31
|
from sky import sky_logging
|
|
32
|
+
from sky import skypilot_config
|
|
33
|
+
from sky.metrics import utils as metrics_lib
|
|
34
|
+
from sky.skylet import constants
|
|
35
|
+
from sky.utils import annotations
|
|
21
36
|
from sky.utils import common_utils
|
|
22
|
-
from sky.utils import
|
|
37
|
+
from sky.utils import context_utils
|
|
23
38
|
from sky.utils import registry
|
|
24
39
|
from sky.utils import status_lib
|
|
40
|
+
from sky.utils import yaml_utils
|
|
41
|
+
from sky.utils.db import db_utils
|
|
42
|
+
from sky.utils.db import migration_utils
|
|
25
43
|
|
|
26
44
|
if typing.TYPE_CHECKING:
|
|
27
45
|
from sky import backends
|
|
@@ -32,171 +50,593 @@ if typing.TYPE_CHECKING:
|
|
|
32
50
|
logger = sky_logging.init_logger(__name__)
|
|
33
51
|
|
|
34
52
|
_ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
53
|
+
_ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
|
|
54
|
+
|
|
55
|
+
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
56
|
+
_SQLALCHEMY_ENGINE_ASYNC: Optional[sql_async.AsyncEngine] = None
|
|
57
|
+
_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
58
|
+
|
|
59
|
+
DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
|
|
60
|
+
DEBUG_CLUSTER_EVENT_RETENTION_HOURS = 30 * 24.0
|
|
61
|
+
MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
|
|
62
|
+
|
|
63
|
+
_UNIQUE_CONSTRAINT_FAILED_ERROR_MSGS = [
|
|
64
|
+
# sqlite
|
|
65
|
+
'UNIQUE constraint failed',
|
|
66
|
+
# postgres
|
|
67
|
+
'duplicate key value violates unique constraint',
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
Base = declarative.declarative_base()
|
|
71
|
+
|
|
72
|
+
config_table = sqlalchemy.Table(
|
|
73
|
+
'config',
|
|
74
|
+
Base.metadata,
|
|
75
|
+
sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
|
|
76
|
+
sqlalchemy.Column('value', sqlalchemy.Text),
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
user_table = sqlalchemy.Table(
|
|
80
|
+
'users',
|
|
81
|
+
Base.metadata,
|
|
82
|
+
sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
|
|
83
|
+
sqlalchemy.Column('name', sqlalchemy.Text),
|
|
84
|
+
sqlalchemy.Column('password', sqlalchemy.Text),
|
|
85
|
+
sqlalchemy.Column('created_at', sqlalchemy.Integer),
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
cluster_table = sqlalchemy.Table(
|
|
89
|
+
'clusters',
|
|
90
|
+
Base.metadata,
|
|
91
|
+
sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
|
|
92
|
+
sqlalchemy.Column('launched_at', sqlalchemy.Integer),
|
|
93
|
+
sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
|
|
94
|
+
sqlalchemy.Column('last_use', sqlalchemy.Text),
|
|
95
|
+
sqlalchemy.Column('status', sqlalchemy.Text),
|
|
96
|
+
sqlalchemy.Column('autostop', sqlalchemy.Integer, server_default='-1'),
|
|
97
|
+
sqlalchemy.Column('to_down', sqlalchemy.Integer, server_default='0'),
|
|
98
|
+
sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
|
|
99
|
+
sqlalchemy.Column('owner', sqlalchemy.Text, server_default=None),
|
|
100
|
+
sqlalchemy.Column('cluster_hash', sqlalchemy.Text, server_default=None),
|
|
101
|
+
sqlalchemy.Column('storage_mounts_metadata',
|
|
102
|
+
sqlalchemy.LargeBinary,
|
|
103
|
+
server_default=None),
|
|
104
|
+
sqlalchemy.Column('cluster_ever_up', sqlalchemy.Integer,
|
|
105
|
+
server_default='0'),
|
|
106
|
+
sqlalchemy.Column('status_updated_at',
|
|
107
|
+
sqlalchemy.Integer,
|
|
108
|
+
server_default=None),
|
|
109
|
+
sqlalchemy.Column('config_hash', sqlalchemy.Text, server_default=None),
|
|
110
|
+
sqlalchemy.Column('user_hash', sqlalchemy.Text, server_default=None),
|
|
111
|
+
sqlalchemy.Column('workspace',
|
|
112
|
+
sqlalchemy.Text,
|
|
113
|
+
server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
|
|
114
|
+
sqlalchemy.Column('last_creation_yaml',
|
|
115
|
+
sqlalchemy.Text,
|
|
116
|
+
server_default=None),
|
|
117
|
+
sqlalchemy.Column('last_creation_command',
|
|
118
|
+
sqlalchemy.Text,
|
|
119
|
+
server_default=None),
|
|
120
|
+
sqlalchemy.Column('is_managed', sqlalchemy.Integer, server_default='0'),
|
|
121
|
+
sqlalchemy.Column('provision_log_path',
|
|
122
|
+
sqlalchemy.Text,
|
|
123
|
+
server_default=None),
|
|
124
|
+
sqlalchemy.Column('skylet_ssh_tunnel_metadata',
|
|
125
|
+
sqlalchemy.LargeBinary,
|
|
126
|
+
server_default=None),
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
storage_table = sqlalchemy.Table(
|
|
130
|
+
'storage',
|
|
131
|
+
Base.metadata,
|
|
132
|
+
sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
|
|
133
|
+
sqlalchemy.Column('launched_at', sqlalchemy.Integer),
|
|
134
|
+
sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
|
|
135
|
+
sqlalchemy.Column('last_use', sqlalchemy.Text),
|
|
136
|
+
sqlalchemy.Column('status', sqlalchemy.Text),
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
volume_table = sqlalchemy.Table(
|
|
140
|
+
'volumes',
|
|
141
|
+
Base.metadata,
|
|
142
|
+
sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
|
|
143
|
+
sqlalchemy.Column('launched_at', sqlalchemy.Integer),
|
|
144
|
+
sqlalchemy.Column('handle', sqlalchemy.LargeBinary),
|
|
145
|
+
sqlalchemy.Column('user_hash', sqlalchemy.Text, server_default=None),
|
|
146
|
+
sqlalchemy.Column('workspace',
|
|
147
|
+
sqlalchemy.Text,
|
|
148
|
+
server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
|
|
149
|
+
sqlalchemy.Column('last_attached_at',
|
|
150
|
+
sqlalchemy.Integer,
|
|
151
|
+
server_default=None),
|
|
152
|
+
sqlalchemy.Column('last_use', sqlalchemy.Text),
|
|
153
|
+
sqlalchemy.Column('status', sqlalchemy.Text),
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
# Table for Cluster History
|
|
157
|
+
# usage_intervals: List[Tuple[int, int]]
|
|
158
|
+
# Specifies start and end timestamps of cluster.
|
|
159
|
+
# When the last end time is None, the cluster is still UP.
|
|
160
|
+
# Example: [(start1, end1), (start2, end2), (start3, None)]
|
|
161
|
+
|
|
162
|
+
# requested_resources: Set[resource_lib.Resource]
|
|
163
|
+
# Requested resources fetched from task that user specifies.
|
|
164
|
+
|
|
165
|
+
# launched_resources: Optional[resources_lib.Resources]
|
|
166
|
+
# Actual launched resources fetched from handle for cluster.
|
|
167
|
+
|
|
168
|
+
# num_nodes: Optional[int] number of nodes launched.
|
|
169
|
+
cluster_history_table = sqlalchemy.Table(
|
|
170
|
+
'cluster_history',
|
|
171
|
+
Base.metadata,
|
|
172
|
+
sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
|
|
173
|
+
sqlalchemy.Column('name', sqlalchemy.Text),
|
|
174
|
+
sqlalchemy.Column('num_nodes', sqlalchemy.Integer),
|
|
175
|
+
sqlalchemy.Column('requested_resources', sqlalchemy.LargeBinary),
|
|
176
|
+
sqlalchemy.Column('launched_resources', sqlalchemy.LargeBinary),
|
|
177
|
+
sqlalchemy.Column('usage_intervals', sqlalchemy.LargeBinary),
|
|
178
|
+
sqlalchemy.Column('user_hash', sqlalchemy.Text),
|
|
179
|
+
sqlalchemy.Column('last_creation_yaml',
|
|
180
|
+
sqlalchemy.Text,
|
|
181
|
+
server_default=None),
|
|
182
|
+
sqlalchemy.Column('last_creation_command',
|
|
183
|
+
sqlalchemy.Text,
|
|
184
|
+
server_default=None),
|
|
185
|
+
sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
|
|
186
|
+
sqlalchemy.Column('provision_log_path',
|
|
187
|
+
sqlalchemy.Text,
|
|
188
|
+
server_default=None),
|
|
189
|
+
sqlalchemy.Column('last_activity_time',
|
|
190
|
+
sqlalchemy.Integer,
|
|
191
|
+
server_default=None,
|
|
192
|
+
index=True),
|
|
193
|
+
sqlalchemy.Column('launched_at',
|
|
194
|
+
sqlalchemy.Integer,
|
|
195
|
+
server_default=None,
|
|
196
|
+
index=True),
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class ClusterEventType(enum.Enum):
|
|
201
|
+
"""Type of cluster event."""
|
|
202
|
+
DEBUG = 'DEBUG'
|
|
203
|
+
"""Used to denote events that are not related to cluster status."""
|
|
204
|
+
|
|
205
|
+
STATUS_CHANGE = 'STATUS_CHANGE'
|
|
206
|
+
"""Used to denote events that modify cluster status."""
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# Table for cluster status change events.
|
|
210
|
+
# starting_status: Status of the cluster at the start of the event.
|
|
211
|
+
# ending_status: Status of the cluster at the end of the event.
|
|
212
|
+
# reason: Reason for the transition.
|
|
213
|
+
# transitioned_at: Timestamp of the transition.
|
|
214
|
+
cluster_event_table = sqlalchemy.Table(
|
|
215
|
+
'cluster_events',
|
|
216
|
+
Base.metadata,
|
|
217
|
+
sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
|
|
218
|
+
sqlalchemy.Column('name', sqlalchemy.Text),
|
|
219
|
+
sqlalchemy.Column('starting_status', sqlalchemy.Text),
|
|
220
|
+
sqlalchemy.Column('ending_status', sqlalchemy.Text),
|
|
221
|
+
sqlalchemy.Column('reason', sqlalchemy.Text, primary_key=True),
|
|
222
|
+
sqlalchemy.Column('transitioned_at', sqlalchemy.Integer, primary_key=True),
|
|
223
|
+
sqlalchemy.Column('type', sqlalchemy.Text),
|
|
224
|
+
sqlalchemy.Column('request_id', sqlalchemy.Text, server_default=None),
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
ssh_key_table = sqlalchemy.Table(
|
|
228
|
+
'ssh_key',
|
|
229
|
+
Base.metadata,
|
|
230
|
+
sqlalchemy.Column('user_hash', sqlalchemy.Text, primary_key=True),
|
|
231
|
+
sqlalchemy.Column('ssh_public_key', sqlalchemy.Text),
|
|
232
|
+
sqlalchemy.Column('ssh_private_key', sqlalchemy.Text),
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
service_account_token_table = sqlalchemy.Table(
|
|
236
|
+
'service_account_tokens',
|
|
237
|
+
Base.metadata,
|
|
238
|
+
sqlalchemy.Column('token_id', sqlalchemy.Text, primary_key=True),
|
|
239
|
+
sqlalchemy.Column('token_name', sqlalchemy.Text),
|
|
240
|
+
sqlalchemy.Column('token_hash', sqlalchemy.Text),
|
|
241
|
+
sqlalchemy.Column('created_at', sqlalchemy.Integer),
|
|
242
|
+
sqlalchemy.Column('last_used_at', sqlalchemy.Integer, server_default=None),
|
|
243
|
+
sqlalchemy.Column('expires_at', sqlalchemy.Integer, server_default=None),
|
|
244
|
+
sqlalchemy.Column('creator_user_hash',
|
|
245
|
+
sqlalchemy.Text), # Who created this token
|
|
246
|
+
sqlalchemy.Column('service_account_user_id',
|
|
247
|
+
sqlalchemy.Text), # Service account's own user ID
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
cluster_yaml_table = sqlalchemy.Table(
|
|
251
|
+
'cluster_yaml',
|
|
252
|
+
Base.metadata,
|
|
253
|
+
sqlalchemy.Column('cluster_name', sqlalchemy.Text, primary_key=True),
|
|
254
|
+
sqlalchemy.Column('yaml', sqlalchemy.Text),
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
system_config_table = sqlalchemy.Table(
|
|
258
|
+
'system_config',
|
|
259
|
+
Base.metadata,
|
|
260
|
+
sqlalchemy.Column('config_key', sqlalchemy.Text, primary_key=True),
|
|
261
|
+
sqlalchemy.Column('config_value', sqlalchemy.Text),
|
|
262
|
+
sqlalchemy.Column('created_at', sqlalchemy.Integer),
|
|
263
|
+
sqlalchemy.Column('updated_at', sqlalchemy.Integer),
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _glob_to_similar(glob_pattern):
|
|
268
|
+
"""Converts a glob pattern to a PostgreSQL LIKE pattern."""
|
|
269
|
+
|
|
270
|
+
# Escape special LIKE characters that are not special in glob
|
|
271
|
+
glob_pattern = glob_pattern.replace('%', '\\%').replace('_', '\\_')
|
|
272
|
+
|
|
273
|
+
# Convert glob wildcards to LIKE wildcards
|
|
274
|
+
like_pattern = glob_pattern.replace('*', '%').replace('?', '_')
|
|
275
|
+
|
|
276
|
+
# Handle character classes, including negation
|
|
277
|
+
def replace_char_class(match):
|
|
278
|
+
group = match.group(0)
|
|
279
|
+
if group.startswith('[!'):
|
|
280
|
+
return '[^' + group[2:-1] + ']'
|
|
281
|
+
return group
|
|
282
|
+
|
|
283
|
+
like_pattern = re.sub(r'\[(!)?.*?\]', replace_char_class, like_pattern)
|
|
284
|
+
return like_pattern
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def create_table(engine: sqlalchemy.engine.Engine):
|
|
41
288
|
# Enable WAL mode to avoid locking issues.
|
|
42
289
|
# See: issue #1441 and PR #1509
|
|
43
290
|
# https://github.com/microsoft/WSL/issues/2395
|
|
44
291
|
# TODO(romilb): We do not enable WAL for WSL because of known issue in WSL.
|
|
45
292
|
# This may cause the database locked problem from WSL issue #1441.
|
|
46
|
-
if
|
|
293
|
+
if (engine.dialect.name == db_utils.SQLAlchemyDialect.SQLITE.value and
|
|
294
|
+
not common_utils.is_wsl()):
|
|
47
295
|
try:
|
|
48
|
-
|
|
49
|
-
|
|
296
|
+
with orm.Session(engine) as session:
|
|
297
|
+
session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
|
|
298
|
+
session.commit()
|
|
299
|
+
except sqlalchemy_exc.OperationalError as e:
|
|
50
300
|
if 'database is locked' not in str(e):
|
|
51
301
|
raise
|
|
52
302
|
# If the database is locked, it is OK to continue, as the WAL mode
|
|
53
303
|
# is not critical and is likely to be enabled by other processes.
|
|
54
304
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
name TEXT PRIMARY KEY,
|
|
59
|
-
launched_at INTEGER,
|
|
60
|
-
handle BLOB,
|
|
61
|
-
last_use TEXT,
|
|
62
|
-
status TEXT,
|
|
63
|
-
autostop INTEGER DEFAULT -1,
|
|
64
|
-
metadata TEXT DEFAULT '{}',
|
|
65
|
-
to_down INTEGER DEFAULT 0,
|
|
66
|
-
owner TEXT DEFAULT null,
|
|
67
|
-
cluster_hash TEXT DEFAULT null,
|
|
68
|
-
storage_mounts_metadata BLOB DEFAULT null,
|
|
69
|
-
cluster_ever_up INTEGER DEFAULT 0,
|
|
70
|
-
status_updated_at INTEGER DEFAULT null,
|
|
71
|
-
config_hash TEXT DEFAULT null,
|
|
72
|
-
user_hash TEXT DEFAULT null)""")
|
|
73
|
-
|
|
74
|
-
# Table for Cluster History
|
|
75
|
-
# usage_intervals: List[Tuple[int, int]]
|
|
76
|
-
# Specifies start and end timestamps of cluster.
|
|
77
|
-
# When the last end time is None, the cluster is still UP.
|
|
78
|
-
# Example: [(start1, end1), (start2, end2), (start3, None)]
|
|
79
|
-
|
|
80
|
-
# requested_resources: Set[resource_lib.Resource]
|
|
81
|
-
# Requested resources fetched from task that user specifies.
|
|
82
|
-
|
|
83
|
-
# launched_resources: Optional[resources_lib.Resources]
|
|
84
|
-
# Actual launched resources fetched from handle for cluster.
|
|
85
|
-
|
|
86
|
-
# num_nodes: Optional[int] number of nodes launched.
|
|
87
|
-
|
|
88
|
-
cursor.execute("""\
|
|
89
|
-
CREATE TABLE IF NOT EXISTS cluster_history (
|
|
90
|
-
cluster_hash TEXT PRIMARY KEY,
|
|
91
|
-
name TEXT,
|
|
92
|
-
num_nodes int,
|
|
93
|
-
requested_resources BLOB,
|
|
94
|
-
launched_resources BLOB,
|
|
95
|
-
usage_intervals BLOB,
|
|
96
|
-
user_hash TEXT)""")
|
|
97
|
-
# Table for configs (e.g. enabled clouds)
|
|
98
|
-
cursor.execute("""\
|
|
99
|
-
CREATE TABLE IF NOT EXISTS config (
|
|
100
|
-
key TEXT PRIMARY KEY, value TEXT)""")
|
|
101
|
-
# Table for Storage
|
|
102
|
-
cursor.execute("""\
|
|
103
|
-
CREATE TABLE IF NOT EXISTS storage (
|
|
104
|
-
name TEXT PRIMARY KEY,
|
|
105
|
-
launched_at INTEGER,
|
|
106
|
-
handle BLOB,
|
|
107
|
-
last_use TEXT,
|
|
108
|
-
status TEXT)""")
|
|
109
|
-
# Table for User
|
|
110
|
-
cursor.execute("""\
|
|
111
|
-
CREATE TABLE IF NOT EXISTS users (
|
|
112
|
-
id TEXT PRIMARY KEY,
|
|
113
|
-
name TEXT)""")
|
|
114
|
-
# For backward compatibility.
|
|
115
|
-
# TODO(zhwu): Remove this function after all users have migrated to
|
|
116
|
-
# the latest version of SkyPilot.
|
|
117
|
-
# Add autostop column to clusters table
|
|
118
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters', 'autostop',
|
|
119
|
-
'INTEGER DEFAULT -1')
|
|
120
|
-
|
|
121
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters', 'metadata',
|
|
122
|
-
'TEXT DEFAULT \'{}\'')
|
|
123
|
-
|
|
124
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters', 'to_down',
|
|
125
|
-
'INTEGER DEFAULT 0')
|
|
126
|
-
|
|
127
|
-
# The cloud identity that created the cluster.
|
|
128
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters', 'owner', 'TEXT')
|
|
129
|
-
|
|
130
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters', 'cluster_hash',
|
|
131
|
-
'TEXT DEFAULT null')
|
|
132
|
-
|
|
133
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters',
|
|
134
|
-
'storage_mounts_metadata', 'BLOB DEFAULT null')
|
|
135
|
-
db_utils.add_column_to_table(
|
|
136
|
-
cursor,
|
|
137
|
-
conn,
|
|
138
|
-
'clusters',
|
|
139
|
-
'cluster_ever_up',
|
|
140
|
-
'INTEGER DEFAULT 0',
|
|
141
|
-
# Set the value to 1 so that all the existing clusters before #2977
|
|
142
|
-
# are considered as ever up, i.e:
|
|
143
|
-
# existing cluster's default (null) -> 1;
|
|
144
|
-
# new cluster's default -> 0;
|
|
145
|
-
# This is conservative for the existing clusters: even if some INIT
|
|
146
|
-
# clusters were never really UP, setting it to 1 means they won't be
|
|
147
|
-
# auto-deleted during any failover.
|
|
148
|
-
value_to_replace_existing_entries=1)
|
|
149
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
|
|
150
|
-
'INTEGER DEFAULT null')
|
|
151
|
-
db_utils.add_column_to_table(
|
|
152
|
-
cursor,
|
|
153
|
-
conn,
|
|
154
|
-
'clusters',
|
|
155
|
-
'user_hash',
|
|
156
|
-
'TEXT DEFAULT null',
|
|
157
|
-
value_to_replace_existing_entries=common_utils.get_user_hash())
|
|
158
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
|
|
159
|
-
'TEXT DEFAULT null')
|
|
160
|
-
|
|
161
|
-
db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
|
|
162
|
-
'TEXT DEFAULT null')
|
|
163
|
-
|
|
164
|
-
db_utils.add_column_to_table(cursor, conn, 'cluster_history', 'user_hash',
|
|
165
|
-
'TEXT DEFAULT null')
|
|
166
|
-
conn.commit()
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
_DB = db_utils.SQLiteConn(_DB_PATH, create_table)
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
def add_or_update_user(user: models.User):
|
|
173
|
-
"""Store the mapping from user hash to user name for display purposes."""
|
|
174
|
-
if user.name is None:
|
|
175
|
-
return
|
|
176
|
-
_DB.cursor.execute('INSERT OR REPLACE INTO users (id, name) VALUES (?, ?)',
|
|
177
|
-
(user.id, user.name))
|
|
178
|
-
_DB.conn.commit()
|
|
305
|
+
migration_utils.safe_alembic_upgrade(
|
|
306
|
+
engine, migration_utils.GLOBAL_USER_STATE_DB_NAME,
|
|
307
|
+
migration_utils.GLOBAL_USER_STATE_VERSION)
|
|
179
308
|
|
|
180
309
|
|
|
181
|
-
def
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
310
|
+
def initialize_and_get_db_async() -> sql_async.AsyncEngine:
|
|
311
|
+
global _SQLALCHEMY_ENGINE_ASYNC
|
|
312
|
+
if _SQLALCHEMY_ENGINE_ASYNC is not None:
|
|
313
|
+
return _SQLALCHEMY_ENGINE_ASYNC
|
|
314
|
+
with _SQLALCHEMY_ENGINE_LOCK:
|
|
315
|
+
if _SQLALCHEMY_ENGINE_ASYNC is not None:
|
|
316
|
+
return _SQLALCHEMY_ENGINE_ASYNC
|
|
187
317
|
|
|
318
|
+
_SQLALCHEMY_ENGINE_ASYNC = db_utils.get_engine('state',
|
|
319
|
+
async_engine=True)
|
|
320
|
+
initialize_and_get_db()
|
|
321
|
+
return _SQLALCHEMY_ENGINE_ASYNC
|
|
188
322
|
|
|
189
|
-
def get_all_users() -> List[models.User]:
|
|
190
|
-
rows = _DB.cursor.execute('SELECT id, name FROM users').fetchall()
|
|
191
|
-
return [models.User(id=row[0], name=row[1]) for row in rows]
|
|
192
323
|
|
|
324
|
+
# We wrap the sqlalchemy engine initialization in a thread
|
|
325
|
+
# lock to ensure that multiple threads do not initialize the
|
|
326
|
+
# engine which could result in a rare race condition where
|
|
327
|
+
# a session has already been created with _SQLALCHEMY_ENGINE = e1,
|
|
328
|
+
# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
|
|
329
|
+
# which could result in e1 being garbage collected unexpectedly.
|
|
330
|
+
def initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
331
|
+
global _SQLALCHEMY_ENGINE
|
|
332
|
+
|
|
333
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
334
|
+
return _SQLALCHEMY_ENGINE
|
|
335
|
+
with _SQLALCHEMY_ENGINE_LOCK:
|
|
336
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
337
|
+
return _SQLALCHEMY_ENGINE
|
|
338
|
+
# get an engine to the db
|
|
339
|
+
engine = db_utils.get_engine('state')
|
|
340
|
+
|
|
341
|
+
# run migrations if needed
|
|
342
|
+
create_table(engine)
|
|
343
|
+
|
|
344
|
+
# return engine
|
|
345
|
+
_SQLALCHEMY_ENGINE = engine
|
|
346
|
+
# Cache the result of _sqlite_supports_returning()
|
|
347
|
+
# ahead of time, as it won't change throughout
|
|
348
|
+
# the lifetime of the engine.
|
|
349
|
+
_sqlite_supports_returning()
|
|
350
|
+
return _SQLALCHEMY_ENGINE
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def _init_db_async(func):
|
|
354
|
+
"""Initialize the async database."""
|
|
355
|
+
|
|
356
|
+
@functools.wraps(func)
|
|
357
|
+
async def wrapper(*args, **kwargs):
|
|
358
|
+
if _SQLALCHEMY_ENGINE_ASYNC is None:
|
|
359
|
+
# this may happen multiple times since there is no locking
|
|
360
|
+
# here but thats fine, this is just a short circuit for the
|
|
361
|
+
# common case.
|
|
362
|
+
await context_utils.to_thread(initialize_and_get_db_async)
|
|
193
363
|
|
|
364
|
+
return await func(*args, **kwargs)
|
|
365
|
+
|
|
366
|
+
return wrapper
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _init_db(func):
|
|
370
|
+
"""Initialize the database."""
|
|
371
|
+
|
|
372
|
+
@functools.wraps(func)
|
|
373
|
+
def wrapper(*args, **kwargs):
|
|
374
|
+
initialize_and_get_db()
|
|
375
|
+
return func(*args, **kwargs)
|
|
376
|
+
|
|
377
|
+
return wrapper
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
|
381
|
+
def _sqlite_supports_returning() -> bool:
|
|
382
|
+
"""Check if SQLite (3.35.0+) and SQLAlchemy (2.0+) support RETURNING.
|
|
383
|
+
|
|
384
|
+
See https://sqlite.org/lang_returning.html and
|
|
385
|
+
https://docs.sqlalchemy.org/en/20/dialects/sqlite.html#insert-update-delete-returning # pylint: disable=line-too-long
|
|
386
|
+
"""
|
|
387
|
+
sqlalchemy_version_parts = sqlalchemy.__version__.split('.')
|
|
388
|
+
assert len(sqlalchemy_version_parts) >= 1, \
|
|
389
|
+
f'Invalid SQLAlchemy version: {sqlalchemy.__version__}'
|
|
390
|
+
sqlalchemy_major = int(sqlalchemy_version_parts[0])
|
|
391
|
+
if sqlalchemy_major < 2:
|
|
392
|
+
return False
|
|
393
|
+
|
|
394
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
395
|
+
if (_SQLALCHEMY_ENGINE.dialect.name !=
|
|
396
|
+
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
397
|
+
return False
|
|
398
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
399
|
+
result = session.execute(sqlalchemy.text('SELECT sqlite_version()'))
|
|
400
|
+
version_str = result.scalar()
|
|
401
|
+
version_parts = version_str.split('.')
|
|
402
|
+
assert len(version_parts) >= 2, \
|
|
403
|
+
f'Invalid version string: {version_str}'
|
|
404
|
+
major, minor = int(version_parts[0]), int(version_parts[1])
|
|
405
|
+
return (major > 3) or (major == 3 and minor >= 35)
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
@_init_db
|
|
409
|
+
@metrics_lib.time_me
|
|
410
|
+
def add_or_update_user(
|
|
411
|
+
user: models.User,
|
|
412
|
+
allow_duplicate_name: bool = True,
|
|
413
|
+
return_user: bool = False
|
|
414
|
+
) -> typing.Union[bool, typing.Tuple[bool, models.User]]:
|
|
415
|
+
"""Store the mapping from user hash to user name for display purposes.
|
|
416
|
+
|
|
417
|
+
Returns:
|
|
418
|
+
If return_user=False: bool (whether the user is newly added)
|
|
419
|
+
If return_user=True: Tuple[bool, models.User]
|
|
420
|
+
"""
|
|
421
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
422
|
+
|
|
423
|
+
if user.name is None:
|
|
424
|
+
return (False, user) if return_user else False
|
|
425
|
+
|
|
426
|
+
# Set created_at if not already set
|
|
427
|
+
created_at = user.created_at
|
|
428
|
+
if created_at is None:
|
|
429
|
+
created_at = int(time.time())
|
|
430
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
431
|
+
# Check for duplicate names if not allowed (within the same transaction)
|
|
432
|
+
if not allow_duplicate_name:
|
|
433
|
+
existing_user = session.query(user_table).filter(
|
|
434
|
+
user_table.c.name == user.name).first()
|
|
435
|
+
if existing_user is not None:
|
|
436
|
+
return (False, user) if return_user else False
|
|
437
|
+
|
|
438
|
+
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
439
|
+
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
440
|
+
# For SQLite, use INSERT OR IGNORE followed by UPDATE to detect new
|
|
441
|
+
# vs existing
|
|
442
|
+
insert_func = sqlite.insert
|
|
443
|
+
|
|
444
|
+
# First try INSERT OR IGNORE - this won't fail if user exists
|
|
445
|
+
insert_stmnt = insert_func(user_table).prefix_with(
|
|
446
|
+
'OR IGNORE').values(id=user.id,
|
|
447
|
+
name=user.name,
|
|
448
|
+
password=user.password,
|
|
449
|
+
created_at=created_at)
|
|
450
|
+
use_returning = return_user and _sqlite_supports_returning()
|
|
451
|
+
if use_returning:
|
|
452
|
+
insert_stmnt = insert_stmnt.returning(
|
|
453
|
+
user_table.c.id,
|
|
454
|
+
user_table.c.name,
|
|
455
|
+
user_table.c.password,
|
|
456
|
+
user_table.c.created_at,
|
|
457
|
+
)
|
|
458
|
+
result = session.execute(insert_stmnt)
|
|
459
|
+
|
|
460
|
+
row = None
|
|
461
|
+
if use_returning:
|
|
462
|
+
# With RETURNING, check if we got a row back.
|
|
463
|
+
row = result.fetchone()
|
|
464
|
+
was_inserted = row is not None
|
|
465
|
+
else:
|
|
466
|
+
# Without RETURNING, use rowcount.
|
|
467
|
+
was_inserted = result.rowcount > 0
|
|
468
|
+
|
|
469
|
+
if not was_inserted:
|
|
470
|
+
# User existed, so update it (but don't update created_at)
|
|
471
|
+
update_values = {user_table.c.name: user.name}
|
|
472
|
+
if user.password:
|
|
473
|
+
update_values[user_table.c.password] = user.password
|
|
474
|
+
|
|
475
|
+
update_stmnt = sqlalchemy.update(user_table).where(
|
|
476
|
+
user_table.c.id == user.id).values(update_values)
|
|
477
|
+
if use_returning:
|
|
478
|
+
update_stmnt = update_stmnt.returning(
|
|
479
|
+
user_table.c.id, user_table.c.name,
|
|
480
|
+
user_table.c.password, user_table.c.created_at)
|
|
481
|
+
|
|
482
|
+
result = session.execute(update_stmnt)
|
|
483
|
+
if use_returning:
|
|
484
|
+
row = result.fetchone()
|
|
485
|
+
|
|
486
|
+
session.commit()
|
|
487
|
+
|
|
488
|
+
if return_user:
|
|
489
|
+
if row is None:
|
|
490
|
+
# row=None means the sqlite used has no RETURNING support,
|
|
491
|
+
# so we need to do a separate query
|
|
492
|
+
row = session.query(user_table).filter_by(
|
|
493
|
+
id=user.id).first()
|
|
494
|
+
updated_user = models.User(id=row.id,
|
|
495
|
+
name=row.name,
|
|
496
|
+
password=row.password,
|
|
497
|
+
created_at=row.created_at)
|
|
498
|
+
return was_inserted, updated_user
|
|
499
|
+
else:
|
|
500
|
+
return was_inserted
|
|
501
|
+
|
|
502
|
+
elif (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
503
|
+
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
|
504
|
+
# For PostgreSQL, use INSERT ... ON CONFLICT with RETURNING to
|
|
505
|
+
# detect insert vs update
|
|
506
|
+
insert_func = postgresql.insert
|
|
507
|
+
|
|
508
|
+
insert_stmnt = insert_func(user_table).values(
|
|
509
|
+
id=user.id,
|
|
510
|
+
name=user.name,
|
|
511
|
+
password=user.password,
|
|
512
|
+
created_at=created_at)
|
|
513
|
+
|
|
514
|
+
# Use a sentinel in the RETURNING clause to detect insert vs update
|
|
515
|
+
if user.password:
|
|
516
|
+
set_ = {
|
|
517
|
+
user_table.c.name: user.name,
|
|
518
|
+
user_table.c.password: user.password
|
|
519
|
+
}
|
|
520
|
+
else:
|
|
521
|
+
set_ = {user_table.c.name: user.name}
|
|
522
|
+
upsert_stmnt = insert_stmnt.on_conflict_do_update(
|
|
523
|
+
index_elements=[user_table.c.id], set_=set_).returning(
|
|
524
|
+
user_table.c.id,
|
|
525
|
+
user_table.c.name,
|
|
526
|
+
user_table.c.password,
|
|
527
|
+
user_table.c.created_at,
|
|
528
|
+
+            # This will be True for INSERT, False for UPDATE
+            sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'
+            ))
+
+            result = session.execute(upsert_stmnt)
+            row = result.fetchone()
+
+            was_inserted = bool(row.was_inserted) if row else False
+            session.commit()
+
+            if return_user:
+                updated_user = models.User(id=row.id,
+                                           name=row.name,
+                                           password=row.password,
+                                           created_at=row.created_at)
+                return was_inserted, updated_user
+            else:
+                return was_inserted
+        else:
+            raise ValueError('Unsupported database dialect')
+
+
+@_init_db
+@metrics_lib.time_me
+def get_user(user_id: str) -> Optional[models.User]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(user_table).filter_by(id=user_id).first()
+    if row is None:
+        return None
+    return models.User(id=row.id,
+                       name=row.name,
+                       password=row.password,
+                       created_at=row.created_at)
+
+
+@_init_db
+@metrics_lib.time_me
+def get_users(user_ids: Set[str]) -> Dict[str, models.User]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter(
+            user_table.c.id.in_(user_ids)).all()
+    return {
+        row.id: models.User(id=row.id,
+                            name=row.name,
+                            password=row.password,
+                            created_at=row.created_at) for row in rows
+    }
+
+
+@_init_db
+@metrics_lib.time_me
+def get_user_by_name(username: str) -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter_by(name=username).all()
+    if len(rows) == 0:
+        return []
+    return [
+        models.User(id=row.id,
+                    name=row.name,
+                    password=row.password,
+                    created_at=row.created_at) for row in rows
+    ]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_user_by_name_match(username_match: str) -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter(
+            user_table.c.name.like(f'%{username_match}%')).all()
+    return [
+        models.User(id=row.id, name=row.name, created_at=row.created_at)
+        for row in rows
+    ]
+
+
+@_init_db
+@metrics_lib.time_me
+def delete_user(user_id: str) -> None:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(user_table).filter_by(id=user_id).delete()
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def get_all_users() -> List[models.User]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).all()
+    return [
+        models.User(id=row.id,
+                    name=row.name,
+                    password=row.password,
+                    created_at=row.created_at) for row in rows
+    ]
+
+
+@_init_db
+@metrics_lib.time_me
 def add_or_update_cluster(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle',
                           requested_resources: Optional[Set[Any]],
                           ready: bool,
                           is_launch: bool = True,
-                          config_hash: Optional[str] = None
+                          config_hash: Optional[str] = None,
+                          task_config: Optional[Dict[str, Any]] = None,
+                          is_managed: bool = False,
+                          provision_log_path: Optional[str] = None,
+                          existing_cluster_hash: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.
 
     Args:
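Note on the upsert tail above: on PostgreSQL, the system column xmax is 0 for a row freshly created by the current statement, so selecting (xmax = 0) distinguishes an INSERT from an UPDATE in a single round trip. A minimal, self-contained sketch of the same trick; the users table and connection URL below are illustrative stand-ins, not the package's schema:

import sqlalchemy
from sqlalchemy.dialects import postgresql

metadata = sqlalchemy.MetaData()
users = sqlalchemy.Table(
    'users', metadata,
    sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('name', sqlalchemy.Text))

# Assumes a reachable PostgreSQL database named 'demo'.
engine = sqlalchemy.create_engine('postgresql://localhost/demo')
metadata.create_all(engine)

stmt = postgresql.insert(users).values(id='u1', name='alice')
stmt = stmt.on_conflict_do_update(
    index_elements=[users.c.id],
    set_={users.c.name: 'alice'},
).returning(
    users.c.id,
    # xmax is 0 only for rows this statement inserted (not updated).
    sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'))

with engine.begin() as conn:
    row = conn.execute(stmt).fetchone()
    print(row.id, bool(row.was_inserted))  # True on first run, False after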
@@ -207,7 +647,17 @@ def add_or_update_cluster(cluster_name: str,
             be marked as INIT, otherwise it will be marked as UP.
         is_launch: if the cluster is firstly launched. If True, the launched_at
             and last_use will be updated. Otherwise, use the old value.
+        config_hash: Configuration hash for the cluster.
+        task_config: The config of the task being launched.
+        is_managed: Whether the cluster is launched by the
+            controller.
+        provision_log_path: Absolute path to provision.log, if available.
+        existing_cluster_hash: If specified, the cluster will be updated
+            only if the cluster_hash matches. If a cluster does not exist,
+            it will not be inserted and an error will be raised.
     """
+    assert _SQLALCHEMY_ENGINE is not None
+
     # FIXME: launched_at will be changed when `sky launch -c` is called.
     handle = pickle.dumps(cluster_handle)
     cluster_launched_at = int(time.time()) if is_launch else None
@@ -240,143 +690,362 @@ def add_or_update_cluster(cluster_name: str,
         cluster_launched_at = int(time.time())
         usage_intervals.append((cluster_launched_at, None))
 
-    user_hash = common_utils.
[old lines 244-379 removed; beyond the stray fragments '#', '(', 'launched_nodes,' and 'pickle.dumps(', their content was not captured in this diff view]
+    user_hash = common_utils.get_current_user().id
+    active_workspace = skypilot_config.get_active_workspace()
+    history_workspace = active_workspace
+    history_hash = user_hash
+
+    conditional_values = {}
+    if is_launch:
+        conditional_values.update({
+            'launched_at': cluster_launched_at,
+            'last_use': last_use
+        })
+
+    if int(ready) == 1:
+        conditional_values.update({
+            'cluster_ever_up': 1,
+        })
+
+    if config_hash is not None:
+        conditional_values.update({
+            'config_hash': config_hash,
+        })
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # with_for_update() locks the row until commit() or rollback()
+        # is called, or until the code escapes the with block.
+        cluster_row = session.query(cluster_table).filter_by(
+            name=cluster_name).with_for_update().first()
+        if (not cluster_row or
+                cluster_row.status == status_lib.ClusterStatus.STOPPED.value):
+            conditional_values.update({
+                'autostop': -1,
+                'to_down': 0,
+            })
+        if not cluster_row or not cluster_row.user_hash:
+            conditional_values.update({
+                'user_hash': user_hash,
+            })
+        if not cluster_row or not cluster_row.workspace:
+            conditional_values.update({
+                'workspace': active_workspace,
+            })
+        if (is_launch and not cluster_row or
+                cluster_row.status != status_lib.ClusterStatus.UP.value):
+            conditional_values.update({
+                'last_creation_yaml': yaml_utils.dump_yaml_str(task_config)
+                                      if task_config else None,
+                'last_creation_command': last_use,
+            })
+        if provision_log_path is not None:
+            conditional_values.update({
+                'provision_log_path': provision_log_path,
+            })
+
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+
+        if existing_cluster_hash is not None:
+            count = session.query(cluster_table).filter_by(
+                name=cluster_name, cluster_hash=existing_cluster_hash).update({
+                    **conditional_values, cluster_table.c.handle: handle,
+                    cluster_table.c.status: status.value,
+                    cluster_table.c.status_updated_at: status_updated_at
+                })
+            assert count <= 1
+            if count == 0:
+                raise ValueError(f'Cluster {cluster_name} with hash '
+                                 f'{existing_cluster_hash} not found.')
+        else:
+            insert_stmnt = insert_func(cluster_table).values(
+                name=cluster_name,
+                **conditional_values,
+                handle=handle,
+                status=status.value,
+                # set metadata to server default ('{}')
+                # set owner to server default (null)
+                cluster_hash=cluster_hash,
+                # set storage_mounts_metadata to server default (null)
+                status_updated_at=status_updated_at,
+                is_managed=int(is_managed),
+            )
+            insert_or_update_stmt = insert_stmnt.on_conflict_do_update(
+                index_elements=[cluster_table.c.name],
+                set_={
+                    **conditional_values,
+                    cluster_table.c.handle: handle,
+                    cluster_table.c.status: status.value,
+                    # do not update metadata value
+                    # do not update owner value
+                    cluster_table.c.cluster_hash: cluster_hash,
+                    # do not update storage_mounts_metadata
+                    cluster_table.c.status_updated_at: status_updated_at,
+                    # do not update user_hash
+                })
+            session.execute(insert_or_update_stmt)
+
+        # Modify cluster history table
+        launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
+        launched_resources = getattr(cluster_handle, 'launched_resources', None)
+        if cluster_row and cluster_row.workspace:
+            history_workspace = cluster_row.workspace
+        if cluster_row and cluster_row.user_hash:
+            history_hash = cluster_row.user_hash
+        creation_info = {}
+        if conditional_values.get('last_creation_yaml') is not None:
+            creation_info = {
+                'last_creation_yaml':
+                    conditional_values.get('last_creation_yaml'),
+                'last_creation_command':
+                    conditional_values.get('last_creation_command'),
+            }
+
+        # Calculate last_activity_time and launched_at from usage_intervals
+        last_activity_time = _get_cluster_last_activity_time(usage_intervals)
+        launched_at = _get_cluster_launch_time(usage_intervals)
+
+        insert_stmnt = insert_func(cluster_history_table).values(
+            cluster_hash=cluster_hash,
+            name=cluster_name,
+            num_nodes=launched_nodes,
+            requested_resources=pickle.dumps(requested_resources),
+            launched_resources=pickle.dumps(launched_resources),
+            usage_intervals=pickle.dumps(usage_intervals),
+            user_hash=user_hash,
+            workspace=history_workspace,
+            provision_log_path=provision_log_path,
+            last_activity_time=last_activity_time,
+            launched_at=launched_at,
+            **creation_info,
+        )
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[cluster_history_table.c.cluster_hash],
+            set_={
+                cluster_history_table.c.name: cluster_name,
+                cluster_history_table.c.num_nodes: launched_nodes,
+                cluster_history_table.c.requested_resources:
+                    pickle.dumps(requested_resources),
+                cluster_history_table.c.launched_resources:
+                    pickle.dumps(launched_resources),
+                cluster_history_table.c.usage_intervals:
+                    pickle.dumps(usage_intervals),
+                cluster_history_table.c.user_hash: history_hash,
+                cluster_history_table.c.workspace: history_workspace,
+                cluster_history_table.c.provision_log_path: provision_log_path,
+                cluster_history_table.c.last_activity_time: last_activity_time,
+                cluster_history_table.c.launched_at: launched_at,
+                **creation_info,
+            })
+        session.execute(do_update_stmt)
+
+        session.commit()
+
+
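The write path above picks the dialect-matching insert() constructor so one INSERT ... ON CONFLICT DO UPDATE code path serves both SQLite and PostgreSQL. A compact sketch of that pattern, using an illustrative table and an in-memory SQLite engine (not the package's schema):

import sqlalchemy
from sqlalchemy.dialects import postgresql, sqlite

metadata = sqlalchemy.MetaData()
clusters = sqlalchemy.Table(
    'clusters', metadata,
    sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('status', sqlalchemy.Text))

engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)

# Both dialect flavors expose on_conflict_do_update() with the same shape,
# so only the constructor choice depends on the connected backend.
insert_func = (sqlite.insert
               if engine.dialect.name == 'sqlite' else postgresql.insert)

stmt = insert_func(clusters).values(name='my-cluster', status='INIT')
stmt = stmt.on_conflict_do_update(index_elements=[clusters.c.name],
                                  set_={clusters.c.status: 'INIT'})
with engine.begin() as conn:
    conn.execute(stmt)  # inserts on first run, updates thereafter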
+@_init_db
+@metrics_lib.time_me
+def add_cluster_event(cluster_name: str,
+                      new_status: Optional[status_lib.ClusterStatus],
+                      reason: str,
+                      event_type: ClusterEventType,
+                      nop_if_duplicate: bool = False,
+                      duplicate_regex: Optional[str] = None,
+                      expose_duplicate_error: bool = False,
+                      transitioned_at: Optional[int] = None) -> None:
+    """Add a cluster event.
+
+    Args:
+        cluster_name: Name of the cluster.
+        new_status: New status of the cluster.
+        reason: Reason for the event.
+        event_type: Type of the event.
+        nop_if_duplicate: If True, do not add the event if it is a duplicate.
+        duplicate_regex: If provided, do not add the event if it matches the
+            regex. Only used if nop_if_duplicate is True.
+        expose_duplicate_error: If True, raise an error if the event is a
+            duplicate. Only used if nop_if_duplicate is True.
+        transitioned_at: If provided, use this timestamp for the event.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+    if cluster_hash is None:
+        logger.debug(f'Hash for cluster {cluster_name} not found. '
+                     'Skipping event.')
+        return
+    if transitioned_at is None:
+        transitioned_at = int(time.time())
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            session.rollback()
+            raise ValueError('Unsupported database dialect')
+
+        cluster_row = session.query(cluster_table).filter_by(name=cluster_name)
+        last_status = cluster_row.first(
+        ).status if cluster_row and cluster_row.first() is not None else None
+        if nop_if_duplicate:
+            last_event = get_last_cluster_event(cluster_hash,
+                                                event_type=event_type)
+            if duplicate_regex is not None and last_event is not None:
+                if re.search(duplicate_regex, last_event):
+                    return
+            elif last_event == reason:
+                return
+        try:
+            request_id = common_utils.get_current_request_id()
+            session.execute(
+                insert_func(cluster_event_table).values(
+                    cluster_hash=cluster_hash,
+                    name=cluster_name,
+                    starting_status=last_status,
+                    ending_status=new_status.value if new_status else None,
+                    reason=reason,
+                    transitioned_at=transitioned_at,
+                    type=event_type.value,
+                    request_id=request_id,
+                ))
+            session.commit()
+        except sqlalchemy.exc.IntegrityError as e:
+            for msg in _UNIQUE_CONSTRAINT_FAILED_ERROR_MSGS:
+                if msg in str(e):
+                    # This can happen if the cluster event is added twice.
+                    # We can ignore this error unless the caller requests
+                    # to expose the error.
+                    if expose_duplicate_error:
+                        raise db_utils.UniqueConstraintViolationError(
+                            value=reason, message=str(e))
+                    else:
+                        return
+            raise e
+
+
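add_cluster_event above leans on the table's unique constraint to resolve races between concurrent writers: it inserts optimistically and treats an IntegrityError as "someone else got there first" rather than checking before inserting, which would race. A stripped-down sketch of the same insert-then-catch pattern (the schema is illustrative):

import sqlalchemy
from sqlalchemy import exc

metadata = sqlalchemy.MetaData()
events = sqlalchemy.Table(
    'events', metadata,
    sqlalchemy.Column('key', sqlalchemy.Text, unique=True),
    sqlalchemy.Column('reason', sqlalchemy.Text))

engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)

def add_event(key: str, reason: str) -> bool:
    """Returns True if inserted, False if a duplicate already existed."""
    try:
        with engine.begin() as conn:
            conn.execute(events.insert().values(key=key, reason=reason))
        return True
    except exc.IntegrityError:
        # A concurrent writer inserted the same key first; treat as a no-op.
        return False

add_event('k1', 'first')         # True
print(add_event('k1', 'again'))  # False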
+def get_last_cluster_event(cluster_hash: str,
+                           event_type: ClusterEventType) -> Optional[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_event_table).filter_by(
+            cluster_hash=cluster_hash, type=event_type.value).order_by(
+                cluster_event_table.c.transitioned_at.desc()).first()
+        if row is None:
+            return None
+        return row.reason
+
+
+def _get_last_cluster_event_multiple(
+        cluster_hashes: Set[str],
+        event_type: ClusterEventType) -> Dict[str, str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Use a subquery to get the latest event for each cluster_hash
+        latest_events = session.query(
+            cluster_event_table.c.cluster_hash,
+            sqlalchemy.func.max(cluster_event_table.c.transitioned_at).label(
+                'max_time')).filter(
+                    cluster_event_table.c.cluster_hash.in_(cluster_hashes),
+                    cluster_event_table.c.type == event_type.value).group_by(
+                        cluster_event_table.c.cluster_hash).subquery()
+
+        # Join with original table to get the full event details
+        rows = session.query(cluster_event_table).join(
+            latest_events,
+            sqlalchemy.and_(
+                cluster_event_table.c.cluster_hash ==
+                latest_events.c.cluster_hash,
+                cluster_event_table.c.transitioned_at ==
+                latest_events.c.max_time)).all()
+
+        return {row.cluster_hash: row.reason for row in rows}
+
+
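_get_last_cluster_event_multiple above is the classic "latest row per group" query: group by the key and take max(timestamp) in a subquery, then join back to recover the full rows. A runnable sketch of the same shape against an illustrative table:

import sqlalchemy
from sqlalchemy import orm

metadata = sqlalchemy.MetaData()
events = sqlalchemy.Table(
    'events', metadata,
    sqlalchemy.Column('cluster_hash', sqlalchemy.Text),
    sqlalchemy.Column('transitioned_at', sqlalchemy.Integer),
    sqlalchemy.Column('reason', sqlalchemy.Text))

engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)

with orm.Session(engine) as session:
    # Subquery: one (hash, max timestamp) row per group.
    latest = session.query(
        events.c.cluster_hash,
        sqlalchemy.func.max(events.c.transitioned_at).label('max_time')
    ).group_by(events.c.cluster_hash).subquery()

    # Join back on (hash, timestamp) to get the full latest event rows.
    rows = session.query(events).join(
        latest,
        sqlalchemy.and_(events.c.cluster_hash == latest.c.cluster_hash,
                        events.c.transitioned_at == latest.c.max_time)).all()
    print({row.cluster_hash: row.reason for row in rows})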
+def cleanup_cluster_events_with_retention(retention_hours: float,
+                                          event_type: ClusterEventType) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    # Once for events with type STATUS_CHANGE.
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_event_table).filter(
+            cluster_event_table.c.transitioned_at <
+            time.time() - retention_hours * 3600,
+            cluster_event_table.c.type == event_type.value)
+        logger.debug(f'Deleting {query.count()} cluster events.')
+        query.delete()
+        session.commit()
+
+
+async def cluster_event_retention_daemon():
+    """Garbage collect cluster events periodically."""
+    while True:
+        logger.info('Running cluster event retention daemon...')
+        # Use the latest config.
+        skypilot_config.reload_config()
+        retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_event_retention_hours'),
+            DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
+        debug_retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_debug_event_retention_hours'),
+            DEBUG_CLUSTER_EVENT_RETENTION_HOURS)
+        try:
+            if retention_hours >= 0:
+                logger.debug('Cleaning up cluster events with retention '
+                             f'{retention_hours} hours.')
+                cleanup_cluster_events_with_retention(
+                    retention_hours, ClusterEventType.STATUS_CHANGE)
+            if debug_retention_hours >= 0:
+                logger.debug('Cleaning up debug cluster events with retention '
+                             f'{debug_retention_hours} hours.')
+                cleanup_cluster_events_with_retention(debug_retention_hours,
+                                                      ClusterEventType.DEBUG)
+        except asyncio.CancelledError:
+            logger.info('Cluster event retention daemon cancelled')
+            break
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error running cluster event retention daemon: {e}')
+
+        # Run daemon at most once every hour to avoid too frequent cleanup.
+        sleep_amount = max(
+            min(retention_hours * 3600, debug_retention_hours * 3600),
+            MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
+        await asyncio.sleep(sleep_amount)
+
+
+def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
+                       event_type: ClusterEventType) -> List[str]:
+    """Returns the cluster events for the cluster.
+
+    Args:
+        cluster_name: Name of the cluster. Cannot be specified if cluster_hash
+            is specified.
+        cluster_hash: Hash of the cluster. Cannot be specified if cluster_name
+            is specified.
+        event_type: Type of the event.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+
+    if cluster_name is not None and cluster_hash is not None:
+        raise ValueError('Cannot specify both cluster_name and cluster_hash')
+    if cluster_name is None and cluster_hash is None:
+        raise ValueError('Must specify either cluster_name or cluster_hash')
+    if cluster_name is not None:
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is None:
+            raise ValueError(f'Hash for cluster {cluster_name} not found.')
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_event_table).filter_by(
+            cluster_hash=cluster_hash, type=event_type.value).order_by(
+                cluster_event_table.c.transitioned_at.asc()).all()
+        return [row.reason for row in rows]
 
 
 def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
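The retention daemon above re-reads the config every cycle and clamps its sleep to a floor interval so cleanup never runs more than roughly hourly. A skeletal version of that loop under stated assumptions: cleanup() is a stand-in for the real cleanup call and MIN_INTERVAL_SECONDS is an assumed floor constant, neither taken from the package.

import asyncio

MIN_INTERVAL_SECONDS = 3600  # assumed floor: never spin faster than hourly

def cleanup(retention_hours: float) -> None:
    print(f'would delete events older than {retention_hours}h')  # stand-in

async def retention_daemon(retention_hours: float) -> None:
    while True:
        try:
            cleanup(retention_hours)
        except asyncio.CancelledError:
            break  # shut down cleanly if the server cancels the task
        except Exception as e:  # keep the daemon alive on transient errors
            print(f'retention daemon error: {e}')
        await asyncio.sleep(max(retention_hours * 3600, MIN_INTERVAL_SECONDS))

# Typically launched once at server startup, e.g.:
#     asyncio.create_task(retention_daemon(retention_hours=24))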
@@ -391,186 +1060,402 @@ def _get_user_hash_or_current_user(user_hash: Optional[str]) -> str:
     return common_utils.get_user_hash()
 
 
+@_init_db
+@metrics_lib.time_me
 def update_cluster_handle(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle'):
+    assert _SQLALCHEMY_ENGINE is not None
     handle = pickle.dumps(cluster_handle)
-    [old lines 397-399 removed; content not captured in this diff view]
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_table).filter_by(name=cluster_name).update(
+            {cluster_table.c.handle: handle})
+        session.commit()
 
 
+@_init_db
+@metrics_lib.time_me
 def update_last_use(cluster_name: str):
     """Updates the last used command for the cluster."""
-    [old lines 404-406 removed; content not captured in this diff view]
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_table).filter_by(name=cluster_name).update(
+            {cluster_table.c.last_use: common_utils.get_current_command()})
+        session.commit()
 
 
+@_init_db
+@metrics_lib.time_me
 def remove_cluster(cluster_name: str, terminate: bool) -> None:
     """Removes cluster_name mapping."""
+    assert _SQLALCHEMY_ENGINE is not None
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)
     usage_intervals = _get_cluster_usage_intervals(cluster_hash)
+    provision_log_path = get_cluster_provision_log_path(cluster_name)
 
-    [old lines 414-445 removed; content not captured in this diff view]
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # usage_intervals is not None and not empty
+        if usage_intervals:
+            assert cluster_hash is not None, cluster_name
+            start_time = usage_intervals.pop()[0]
+            end_time = int(time.time())
+            usage_intervals.append((start_time, end_time))
+            _set_cluster_usage_intervals(cluster_hash, usage_intervals)
+
+        if provision_log_path:
+            assert cluster_hash is not None, cluster_name
+            session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash
+            ).filter(
+                cluster_history_table.c.provision_log_path.is_(None)
+            ).update({
+                cluster_history_table.c.provision_log_path: provision_log_path
+            })
+
+        if terminate:
+            session.query(cluster_table).filter_by(name=cluster_name).delete()
+        else:
+            handle = get_handle_from_cluster_name(cluster_name)
+            if handle is None:
+                return
+            # Must invalidate IP list to avoid directly trying to ssh into a
+            # stopped VM, which leads to timeout.
+            if hasattr(handle, 'stable_internal_external_ips'):
+                handle = typing.cast('backends.CloudVmRayResourceHandle',
+                                     handle)
+                handle.stable_internal_external_ips = None
+            current_time = int(time.time())
+            session.query(cluster_table).filter_by(name=cluster_name).update({
+                cluster_table.c.handle: pickle.dumps(handle),
+                cluster_table.c.status: status_lib.ClusterStatus.STOPPED.value,
+                cluster_table.c.status_updated_at: current_time
+            })
+        session.commit()
+
+
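remove_cluster above closes the cluster's open usage interval before persisting: it pops the trailing (start, None) pair and re-appends (start, now). A minimal sketch of that bookkeeping in isolation, using the same (start, end-or-None) representation:

import time
from typing import List, Optional, Tuple

Interval = Tuple[int, Optional[int]]

def close_open_interval(usage_intervals: List[Interval]) -> List[Interval]:
    # An end of None marks "still running"; stamp it with the current time.
    if usage_intervals and usage_intervals[-1][1] is None:
        start_time = usage_intervals.pop()[0]
        usage_intervals.append((start_time, int(time.time())))
    return usage_intervals

print(close_open_interval([(1700000000, 1700003600), (1700010000, None)]))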
+@_init_db
+@metrics_lib.time_me
 def get_handle_from_cluster_name(
         cluster_name: str) -> Optional['backends.ResourceHandle']:
+    assert _SQLALCHEMY_ENGINE is not None
     assert cluster_name is not None, 'cluster_name cannot be None'
-    [old lines 449-453 removed; content not captured in this diff view]
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = (session.query(
+            cluster_table.c.handle).filter_by(name=cluster_name).first())
+        if row is None:
+            return None
+        return pickle.loads(row.handle)
+
+
+@_init_db
+@metrics_lib.time_me
+def get_handles_from_cluster_names(
+    cluster_names: Set[str]
+) -> Dict[str, Optional['backends.ResourceHandle']]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_table.c.name,
+                             cluster_table.c.handle).filter(
+                                 cluster_table.c.name.in_(cluster_names)).all()
+        return {
+            row.name: pickle.loads(row.handle) if row is not None else None
+            for row in rows
+        }
 
 
-    [old line 456 removed; content not captured in this diff view]
+@_init_db
+@metrics_lib.time_me
+def get_cluster_name_to_handle_map(
+    is_managed: Optional[bool] = None,
+) -> Dict[str, Optional['backends.ResourceHandle']]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_table.c.name, cluster_table.c.handle)
+        if is_managed is not None:
+            query = query.filter(cluster_table.c.is_managed == int(is_managed))
+        rows = query.all()
+        name_to_handle = {}
+        for row in rows:
+            if row.handle and len(row.handle) > 0:
+                name_to_handle[row.name] = pickle.loads(row.handle)
+            else:
+                name_to_handle[row.name] = None
+        return name_to_handle
+
+
+@_init_db_async
+@metrics_lib.time_me
+async def get_status_from_cluster_name_async(
+        cluster_name: str) -> Optional[status_lib.ClusterStatus]:
+    """Get the status of a cluster."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
     assert cluster_name is not None, 'cluster_name cannot be None'
-    [old lines 458-460 removed; content not captured in this diff view]
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.select(cluster_table.c.status).where(
+                cluster_table.c.name == cluster_name))
+        row = result.first()
+
+    if row is None:
+        return None
+    return status_lib.ClusterStatus(row[0])
 
 
+@_init_db
+@metrics_lib.time_me
+def get_status_from_cluster_name(
+        cluster_name: str) -> Optional[status_lib.ClusterStatus]:
+    assert _SQLALCHEMY_ENGINE is not None
+    assert cluster_name is not None, 'cluster_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.status).filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        return status_lib.ClusterStatus[row.status]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_glob_cluster_names(
+        cluster_name: str,
+        workspaces_filter: Optional[Set[str]] = None) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    assert cluster_name is not None, 'cluster_name cannot be None'
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            query = session.query(cluster_table.c.name).filter(
+                cluster_table.c.name.op('GLOB')(cluster_name))
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            query = session.query(cluster_table.c.name).filter(
+                cluster_table.c.name.op('SIMILAR TO')(
+                    _glob_to_similar(cluster_name)))
+        else:
+            raise ValueError('Unsupported database dialect')
+        if workspaces_filter is not None:
+            query = query.filter(
+                cluster_table.c.workspace.in_(workspaces_filter))
+        rows = query.all()
+        return [row.name for row in rows]
+
+
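get_glob_cluster_names above uses SQLite's native GLOB operator but translates the pattern to SIMILAR TO on PostgreSQL via _glob_to_similar (defined elsewhere in the module and not shown in this diff). A hypothetical translation, for illustration only; the package's helper may differ:

# Hypothetical glob -> SIMILAR TO conversion; illustrative, not the
# package's _glob_to_similar implementation.
def glob_to_similar(glob: str) -> str:
    # GLOB wildcards: '*' matches any run, '?' matches one character.
    # SIMILAR TO uses '%' and '_', plus SQL-regex metacharacters on top,
    # which must be escaped (backslash is the conventional escape).
    out = []
    for ch in glob:
        if ch == '*':
            out.append('%')
        elif ch == '?':
            out.append('_')
        elif ch in '%_|()[]{}+':
            out.append('\\' + ch)
        else:
            out.append(ch)
    return ''.join(out)

print(glob_to_similar('my-cluster-*'))  # my-cluster-%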
+@_init_db
+@metrics_lib.time_me
 def set_cluster_status(cluster_name: str,
                        status: status_lib.ClusterStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
     current_time = int(time.time())
-    [old lines 466-470 removed; content not captured in this diff view]
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.status: status.value,
+                cluster_table.c.status_updated_at: current_time
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def set_cluster_autostop_value(cluster_name: str, idle_minutes: int,
                                to_down: bool) -> None:
-    [old lines 478-485 removed; content not captured in this diff view]
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.autostop: idle_minutes,
+                cluster_table.c.to_down: int(to_down)
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def get_cluster_launch_time(cluster_name: str) -> Optional[int]:
-    [old lines 492-496 removed; content not captured in this diff view]
-        return
-    return
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.launched_at).filter_by(name=cluster_name).first()
+        if row is None or row.launched_at is None:
+            return None
+        return int(row.launched_at)
 
 
+@_init_db
+@metrics_lib.time_me
 def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
-    [old lines 502-505 removed; content not captured in this diff view]
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.metadata).filter_by(name=cluster_name).first()
+        if row is None or row.metadata is None:
+            return None
+        return json.loads(row.metadata)
+
+
+@_init_db
+@metrics_lib.time_me
+def get_cluster_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from clusters table, if recorded."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        return getattr(row, 'provision_log_path', None)
+
+
+@_init_db
+@metrics_lib.time_me
+def get_cluster_history_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from cluster_history for this name.
+
+    If the cluster currently exists, we use its hash. Otherwise, we look up
+    historical rows by name and choose the most recent one based on
+    usage_intervals.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Try current cluster first (fast path)
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is not None:
+            row = session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash).first()
+            if row is not None:
+                return getattr(row, 'provision_log_path', None)
+
+        # Fallback: search history by name and pick the latest by
+        # usage_intervals
+        rows = session.query(cluster_history_table).filter_by(
+            name=cluster_name).all()
+        if not rows:
             return None
-    [old lines 507-508 removed; content not captured in this diff view]
+
+        def latest_timestamp(usages_bin) -> int:
+            try:
+                intervals = pickle.loads(usages_bin)
+                # intervals: List[Tuple[int, Optional[int]]]
+                if not intervals:
+                    return -1
+                _, end = intervals[-1]
+                return end if end is not None else int(time.time())
+            except Exception:  # pylint: disable=broad-except
+                return -1
+
+        latest_row = max(rows,
+                         key=lambda r: latest_timestamp(r.usage_intervals))
+        return getattr(latest_row, 'provision_log_path', None)
 
 
+@_init_db
+@metrics_lib.time_me
 def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
-    [old lines 512-517 removed; content not captured in this diff view]
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.metadata: json.dumps(metadata)})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def get_cluster_storage_mounts_metadata(
         cluster_name: str) -> Optional[Dict[str, Any]]:
-    [old lines 525-531 removed; only the fragment '(' was captured]
-        return None
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = (session.query(cluster_table.c.storage_mounts_metadata).filter_by(
+            name=cluster_name).first())
+        if row is None or row.storage_mounts_metadata is None:
+            return None
+        return pickle.loads(row.storage_mounts_metadata)
 
 
+@_init_db
+@metrics_lib.time_me
 def set_cluster_storage_mounts_metadata(
         cluster_name: str, storage_mounts_metadata: Dict[str, Any]) -> None:
-    [old lines 537-543 removed; only the fragment 'cluster_name' was captured]
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update({
+                cluster_table.c.storage_mounts_metadata:
+                    pickle.dumps(storage_mounts_metadata)
+            })
+        session.commit()
+    assert count <= 1, count
+    if count == 0:
+        raise ValueError(f'Cluster {cluster_name} not found.')
+
+
+@_init_db
+@metrics_lib.time_me
+def get_cluster_skylet_ssh_tunnel_metadata(
+        cluster_name: str) -> Optional[Tuple[int, int]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_table.c.skylet_ssh_tunnel_metadata).filter_by(
+                name=cluster_name).first()
+        if row is None or row.skylet_ssh_tunnel_metadata is None:
+            return None
+        return pickle.loads(row.skylet_ssh_tunnel_metadata)
+
+
+@_init_db
+@metrics_lib.time_me
+def set_cluster_skylet_ssh_tunnel_metadata(
+        cluster_name: str,
+        skylet_ssh_tunnel_metadata: Optional[Tuple[int, int]]) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        value = pickle.dumps(
+            skylet_ssh_tunnel_metadata
+        ) if skylet_ssh_tunnel_metadata is not None else None
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.skylet_ssh_tunnel_metadata: value})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def _get_cluster_usage_intervals(
     cluster_hash: Optional[str]
 ) -> Optional[List[Tuple[int, Optional[int]]]]:
+    assert _SQLALCHEMY_ENGINE is not None
     if cluster_hash is None:
         return None
-    [old lines 554-559 removed; content not captured in this diff view]
-        return pickle.loads(usage_intervals)
-    return None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_history_table.c.usage_intervals).filter_by(
+            cluster_hash=cluster_hash).first()
+        if row is None or row.usage_intervals is None:
+            return None
+        return pickle.loads(row.usage_intervals)
 
 
-def _get_cluster_launch_time(
-    usage_intervals
+def _get_cluster_launch_time(
+    usage_intervals: Optional[List[Tuple[int,
+                                         Optional[int]]]]) -> Optional[int]:
     if usage_intervals is None:
         return None
     return usage_intervals[0][0]
 
 
-def _get_cluster_duration(
+def _get_cluster_duration(
+        usage_intervals: Optional[List[Tuple[int, Optional[int]]]]) -> int:
     total_duration = 0
-    usage_intervals = _get_cluster_usage_intervals(cluster_hash)
 
     if usage_intervals is None:
         return total_duration
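When several history rows share a cluster name, the fallback in get_cluster_history_provision_log_path above ranks them by the end of their last usage interval, with an open interval (end is None) counting as "now". The ranking logic in isolation:

import time
from typing import List, Optional, Tuple

def latest_timestamp(intervals: List[Tuple[int, Optional[int]]]) -> int:
    # Empty or unreadable interval lists rank last (-1).
    if not intervals:
        return -1
    _, end = intervals[-1]
    return end if end is not None else int(time.time())

rows = [[(100, 200)], [(150, None)], []]
print(max(rows, key=latest_timestamp))  # [(150, None)]: still running wins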
@@ -587,60 +1472,89 @@ def _get_cluster_duration(cluster_hash: str) -> int:
     return total_duration
 
 
+def _get_cluster_last_activity_time(
+        usage_intervals: Optional[List[Tuple[int,
+                                             Optional[int]]]]) -> Optional[int]:
+    last_activity_time = None
+    if usage_intervals:
+        last_interval = usage_intervals[-1]
+        last_activity_time = (last_interval[1] if last_interval[1] is not None
+                              else last_interval[0])
+    return last_activity_time
+
+
+@_init_db
+@metrics_lib.time_me
 def _set_cluster_usage_intervals(
         cluster_hash: str, usage_intervals: List[Tuple[int,
                                                        Optional[int]]]) -> None:
-    [old lines 593-601 removed; content not captured in this diff view]
+    assert _SQLALCHEMY_ENGINE is not None
+
+    # Calculate last_activity_time from usage_intervals
+    last_activity_time = _get_cluster_last_activity_time(usage_intervals)
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_history_table).filter_by(
+            cluster_hash=cluster_hash).update({
+                cluster_history_table.c.usage_intervals:
+                    pickle.dumps(usage_intervals),
+                cluster_history_table.c.last_activity_time: last_activity_time,
+            })
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster hash {cluster_hash} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def set_owner_identity_for_cluster(cluster_name: str,
                                    owner_identity: Optional[List[str]]) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
     if owner_identity is None:
         return
     owner_identity_str = json.dumps(owner_identity)
-    [old lines 612-616 removed; content not captured in this diff view]
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.owner: owner_identity_str})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
-    [old lines 623-627 removed; content not captured in this diff view]
-        return
-    return
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = (session.query(
+            cluster_table.c.cluster_hash).filter_by(name=cluster_name).first())
+        if row is None or row.cluster_hash is None:
+            return None
+        return row.cluster_hash
 
 
+@_init_db
+@metrics_lib.time_me
 def get_launched_resources_from_cluster_hash(
         cluster_hash: str) -> Optional[Tuple[int, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(
+            cluster_history_table.c.num_nodes,
+            cluster_history_table.c.launched_resources).filter_by(
+                cluster_hash=cluster_hash).first()
+        if row is None:
+            return None
+        num_nodes = row.num_nodes
+        launched_resources = row.launched_resources
 
-    [old lines 635-638 removed; content not captured in this diff view]
-    if num_nodes is None or launched_resources is None:
-        return None
-    launched_resources = pickle.loads(launched_resources)
-    return num_nodes, launched_resources
-    return None
+    if num_nodes is None or launched_resources is None:
+        return None
+    launched_resources = pickle.loads(launched_resources)
+    return num_nodes, launched_resources
 
 
 def _load_owner(record_owner: Optional[str]) -> Optional[List[str]]:
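The interval helpers above share one representation: a list of (start, end) pairs where end is None for a still-running interval. A sketch of the arithmetic _get_cluster_duration and _get_cluster_last_activity_time perform over it, assuming (the duration body is elided from this diff) that an open interval accrues up to the current time:

import time
from typing import List, Optional, Tuple

Interval = Tuple[int, Optional[int]]

def total_duration(intervals: List[Interval]) -> int:
    total = 0
    for start, end in intervals:
        if end is None:
            end = int(time.time())  # assumed: open interval counts to now
        total += end - start
    return total

def last_activity(intervals: List[Interval]) -> Optional[int]:
    if not intervals:
        return None
    start, end = intervals[-1]
    # A closed interval's end, else the open interval's start.
    return end if end is not None else start

print(total_duration([(0, 10), (20, 30)]))  # 20
print(last_activity([(0, 10), (20, 30)]))   # 30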
@@ -671,176 +1585,491 @@ def _load_storage_mounts_metadata(
|
|
|
671
1585
|
return pickle.loads(record_storage_mounts_metadata)
|
|
672
1586
|
|
|
673
1587
|
|
|
1588
|
+
@_init_db
|
|
1589
|
+
@metrics_lib.time_me
|
|
1590
|
+
@context_utils.cancellation_guard
|
|
674
1591
|
def get_cluster_from_name(
|
|
675
|
-
cluster_name: Optional[str]
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
1592
|
+
cluster_name: Optional[str],
|
|
1593
|
+
*,
|
|
1594
|
+
include_user_info: bool = True,
|
|
1595
|
+
summary_response: bool = False) -> Optional[Dict[str, Any]]:
|
|
1596
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1597
|
+
query_fields = [
|
|
1598
|
+
cluster_table.c.name,
|
|
1599
|
+
cluster_table.c.launched_at,
|
|
1600
|
+
cluster_table.c.handle,
|
|
1601
|
+
cluster_table.c.last_use,
|
|
1602
|
+
cluster_table.c.status,
|
|
1603
|
+
cluster_table.c.autostop,
|
|
1604
|
+
cluster_table.c.to_down,
|
|
1605
|
+
cluster_table.c.owner,
|
|
1606
|
+
cluster_table.c.metadata,
|
|
1607
|
+
cluster_table.c.cluster_hash,
|
|
1608
|
+
cluster_table.c.cluster_ever_up,
|
|
1609
|
+
cluster_table.c.status_updated_at,
|
|
1610
|
+
cluster_table.c.user_hash,
|
|
1611
|
+
cluster_table.c.config_hash,
|
|
1612
|
+
cluster_table.c.workspace,
|
|
1613
|
+
cluster_table.c.is_managed,
|
|
1614
|
+
]
|
|
1615
|
+
if not summary_response:
|
|
1616
|
+
query_fields.extend([
|
|
1617
|
+
cluster_table.c.last_creation_yaml,
|
|
1618
|
+
cluster_table.c.last_creation_command,
|
|
1619
|
+
])
|
|
1620
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1621
|
+
query = session.query(*query_fields)
|
|
1622
|
+
row = query.filter_by(name=cluster_name).first()
|
|
1623
|
+
if row is None:
|
|
1624
|
+
return None
|
|
1625
|
+
if include_user_info:
|
|
1626
|
+
user_hash = _get_user_hash_or_current_user(row.user_hash)
|
|
1627
|
+
user = get_user(user_hash)
|
|
1628
|
+
user_name = user.name if user is not None else None
|
|
1629
|
+
if not summary_response:
|
|
1630
|
+
last_event = get_last_cluster_event(
|
|
1631
|
+
row.cluster_hash, event_type=ClusterEventType.STATUS_CHANGE)
|
|
1632
|
+
# TODO: use namedtuple instead of dict
|
|
1633
|
+
record = {
|
|
1634
|
+
'name': row.name,
|
|
1635
|
+
'launched_at': row.launched_at,
|
|
1636
|
+
'handle': pickle.loads(row.handle),
|
|
1637
|
+
'last_use': row.last_use,
|
|
1638
|
+
'status': status_lib.ClusterStatus[row.status],
|
|
1639
|
+
'autostop': row.autostop,
|
|
1640
|
+
'to_down': bool(row.to_down),
|
|
1641
|
+
'owner': _load_owner(row.owner),
|
|
1642
|
+
'metadata': json.loads(row.metadata),
|
|
1643
|
+
'cluster_hash': row.cluster_hash,
|
|
1644
|
+
'cluster_ever_up': bool(row.cluster_ever_up),
|
|
1645
|
+
'status_updated_at': row.status_updated_at,
|
|
1646
|
+
'workspace': row.workspace,
|
|
1647
|
+
'is_managed': bool(row.is_managed),
|
|
1648
|
+
'config_hash': row.config_hash,
|
|
1649
|
+
}
|
|
1650
|
+
if not summary_response:
|
|
1651
|
+
record['last_creation_yaml'] = row.last_creation_yaml
|
|
1652
|
+
record['last_creation_command'] = row.last_creation_command
|
|
1653
|
+
record['last_event'] = last_event
|
|
1654
|
+
if include_user_info:
|
|
1655
|
+
record['user_hash'] = user_hash
|
|
1656
|
+
record['user_name'] = user_name
|
|
1657
|
+
|
|
1658
|
+
return record
|
|
1659
|
+
|
|
1660
|
+
|
|
1661
|
+
@_init_db
|
|
1662
|
+
@metrics_lib.time_me
|
|
1663
|
+
@context_utils.cancellation_guard
|
|
1664
|
+
def cluster_with_name_exists(cluster_name: str) -> bool:
|
|
1665
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1666
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1667
|
+
row = session.query(
|
|
1668
|
+
cluster_table.c.name).filter_by(name=cluster_name).first()
|
|
1669
|
+
if row is None:
|
|
1670
|
+
return False
|
|
1671
|
+
return True
|
|
1672
|
+
|
|
1673
|
+
|
|
1674
|
+
@_init_db
|
|
1675
|
+
@metrics_lib.time_me
|
|
1676
|
+
def get_clusters(
|
|
1677
|
+
*, # keyword only separator
|
|
1678
|
+
exclude_managed_clusters: bool = False,
|
|
1679
|
+
workspaces_filter: Optional[Dict[str, Any]] = None,
|
|
1680
|
+
user_hashes_filter: Optional[Set[str]] = None,
|
|
1681
|
+
cluster_names: Optional[List[str]] = None,
|
|
1682
|
+
summary_response: bool = False,
|
|
1683
|
+
) -> List[Dict[str, Any]]:
|
|
1684
|
+
"""Get clusters from the database.
|
|
712
1685
|
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
1686
|
+
Args:
|
|
1687
|
+
exclude_managed_clusters: If True, exclude clusters that have
|
|
1688
|
+
is_managed field set to True.
|
|
1689
|
+
workspaces_filter: If specified, only include clusters
|
|
1690
|
+
that has workspace field set to one of the values.
|
|
1691
|
+
user_hashes_filter: If specified, only include clusters
|
|
1692
|
+
that has user_hash field set to one of the values.
|
|
1693
|
+
cluster_names: If specified, only include clusters
|
|
1694
|
+
that has name field set to one of the values.
|
|
1695
|
+
"""
|
|
1696
|
+
# is a cluster has a null user_hash,
|
|
1697
|
+
# we treat it as belonging to the current user.
|
|
1698
|
+
current_user_hash = common_utils.get_user_hash()
|
|
1699
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1700
|
+
query_fields = [
|
|
1701
|
+
cluster_table.c.name,
|
|
1702
|
+
cluster_table.c.launched_at,
|
|
1703
|
+
cluster_table.c.handle,
|
|
1704
|
+
cluster_table.c.status,
|
|
1705
|
+
cluster_table.c.autostop,
|
|
1706
|
+
cluster_table.c.to_down,
|
|
1707
|
+
cluster_table.c.cluster_hash,
|
|
1708
|
+
cluster_table.c.cluster_ever_up,
|
|
1709
|
+
cluster_table.c.user_hash,
|
|
1710
|
+
cluster_table.c.workspace,
|
|
1711
|
+
user_table.c.name.label('user_name'),
|
|
1712
|
+
]
|
|
1713
|
+
if not summary_response:
|
|
1714
|
+
query_fields.extend([
|
|
1715
|
+
cluster_table.c.last_creation_yaml,
|
|
1716
|
+
cluster_table.c.last_creation_command,
|
|
1717
|
+
cluster_table.c.config_hash,
|
|
1718
|
+
cluster_table.c.owner,
|
|
1719
|
+
cluster_table.c.metadata,
|
|
1720
|
+
cluster_table.c.last_use,
|
|
1721
|
+
cluster_table.c.status_updated_at,
|
|
1722
|
+
])
|
|
1723
|
+
if not exclude_managed_clusters:
|
|
1724
|
+
query_fields.append(cluster_table.c.is_managed)
|
|
1725
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1726
|
+
query = session.query(*query_fields).outerjoin(
|
|
1727
|
+
user_table, cluster_table.c.user_hash == user_table.c.id)
|
|
1728
|
+
if exclude_managed_clusters:
|
|
1729
|
+
query = query.filter(cluster_table.c.is_managed == int(False))
|
|
1730
|
+
if workspaces_filter is not None:
|
|
1731
|
+
query = query.filter(
|
|
1732
|
+
cluster_table.c.workspace.in_(workspaces_filter))
|
|
1733
|
+
if user_hashes_filter is not None:
|
|
1734
|
+
if current_user_hash in user_hashes_filter:
|
|
1735
|
+
# backwards compatibility for old clusters.
|
|
1736
|
+
# If current_user_hash is in user_hashes_filter, we include
|
|
1737
|
+
# clusters that have a null user_hash.
|
|
1738
|
+
query = query.filter(
|
|
1739
|
+
(cluster_table.c.user_hash.in_(user_hashes_filter) |
|
|
1740
|
+
(cluster_table.c.user_hash is None)))
|
|
1741
|
+
else:
|
|
1742
|
+
query = query.filter(
|
|
1743
|
+
cluster_table.c.user_hash.in_(user_hashes_filter))
|
|
1744
|
+
if cluster_names is not None:
|
|
1745
|
+
query = query.filter(cluster_table.c.name.in_(cluster_names))
|
|
1746
|
+
query = query.order_by(sqlalchemy.desc(cluster_table.c.launched_at))
|
|
1747
|
+
rows = query.all()
|
|
719
1748
|
records = []
|
|
1749
|
+
|
|
1750
|
+
# Check if we need to fetch the current user's name,
|
|
1751
|
+
# for backwards compatibility, if user_hash is None.
|
|
1752
|
+
current_user_name = None
|
|
1753
|
+
needs_current_user = any(row.user_hash is None for row in rows)
|
|
1754
|
+
if needs_current_user:
|
|
1755
|
+
current_user = get_user(current_user_hash)
|
|
1756
|
+
current_user_name = (current_user.name
|
|
1757
|
+
if current_user is not None else None)
|
|
1758
|
+
|
|
1759
|
+
# get last cluster event for each row
|
|
1760
|
+
if not summary_response:
|
|
1761
|
+
cluster_hashes = {row.cluster_hash for row in rows}
|
|
1762
|
+
last_cluster_event_dict = _get_last_cluster_event_multiple(
|
|
1763
|
+
cluster_hashes, ClusterEventType.STATUS_CHANGE)
|
|
1764
|
+
|
|
720
1765
|
for row in rows:
|
|
721
|
-
(name, launched_at, handle, last_use, status, autostop, metadata,
|
|
722
|
-
to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
|
|
723
|
-
status_updated_at, config_hash, user_hash) = row
|
|
724
|
-
user_hash = _get_user_hash_or_current_user(user_hash)
|
|
725
1766
|
# TODO: use namedtuple instead of dict
|
|
726
1767
|
record = {
|
|
727
|
-
'name': name,
|
|
728
|
-
'launched_at': launched_at,
|
|
729
|
-
'handle': pickle.loads(handle),
|
|
730
|
-
'
|
|
731
|
-
'
|
|
732
|
-
'
|
|
733
|
-
'
|
|
734
|
-
'
|
|
735
|
-
'
|
|
736
|
-
|
|
737
|
-
'
|
|
738
|
-
|
|
739
|
-
'
|
|
740
|
-
'
|
|
741
|
-
|
|
742
|
-
'user_name': get_user(user_hash).name,
|
|
743
|
-
'config_hash': config_hash,
|
|
1768
|
+
'name': row.name,
|
|
1769
|
+
'launched_at': row.launched_at,
|
|
1770
|
+
'handle': pickle.loads(row.handle),
|
|
1771
|
+
'status': status_lib.ClusterStatus[row.status],
|
|
1772
|
+
'autostop': row.autostop,
|
|
1773
|
+
'to_down': bool(row.to_down),
|
|
1774
|
+
'cluster_hash': row.cluster_hash,
|
|
1775
|
+
'cluster_ever_up': bool(row.cluster_ever_up),
|
|
1776
|
+
'user_hash': (row.user_hash
|
|
1777
|
+
if row.user_hash is not None else current_user_hash),
|
|
1778
|
+
'user_name': (row.user_name
|
|
1779
|
+
if row.user_name is not None else current_user_name),
|
|
1780
|
+
'workspace': row.workspace,
|
|
1781
|
+
'is_managed': False
|
|
1782
|
+
if exclude_managed_clusters else bool(row.is_managed),
|
|
744
1783
|
}
|
|
1784
|
+
if not summary_response:
|
|
1785
|
+
record['last_creation_yaml'] = row.last_creation_yaml
|
|
1786
|
+
record['last_creation_command'] = row.last_creation_command
|
|
1787
|
+
record['last_event'] = last_cluster_event_dict.get(
|
|
1788
|
+
row.cluster_hash, None)
|
|
1789
|
+
record['config_hash'] = row.config_hash
|
|
1790
|
+
record['owner'] = _load_owner(row.owner)
|
|
1791
|
+
record['metadata'] = json.loads(row.metadata)
|
|
1792
|
+
record['last_use'] = row.last_use
|
|
1793
|
+
record['status_updated_at'] = row.status_updated_at
|
|
745
1794
|
|
|
746
1795
|
records.append(record)
|
|
747
1796
|
return records
|
|
748
1797
|
|
|
749
1798
|
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
1799
|
+
@_init_db
|
|
1800
|
+
@metrics_lib.time_me
|
|
1801
|
+
def get_cluster_names(exclude_managed_clusters: bool = False,) -> List[str]:
|
|
1802
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1803
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1804
|
+
query = session.query(cluster_table.c.name)
|
|
1805
|
+
if exclude_managed_clusters:
|
|
1806
|
+
query = query.filter(cluster_table.c.is_managed == int(False))
|
|
1807
|
+
rows = query.all()
|
|
1808
|
+
return [row[0] for row in rows]
|
|
 
-    # '(cluster_hash, name, num_nodes, requested_resources, '
-    # 'launched_resources, usage_intervals) '
-    records = []
 
-
-
+@_init_db
+@metrics_lib.time_me
+def get_clusters_from_history(
+        days: Optional[int] = None,
+        abbreviate_response: bool = False,
+        cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+    """Get cluster reports from history.
 
-
-
-
-
-
-            usage_intervals,
-            status,
-            user_hash,
-        ) = row[:7]
-        user_hash = _get_user_hash_or_current_user(user_hash)
+    Args:
+        days: If specified, only include historical clusters (those not
+            currently active) that were last used within the past 'days'
+            days. Active clusters are always included regardless of this
+            parameter.
 
-
-
+    Returns:
+        List of cluster records with history information.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+
+    current_user_hash = common_utils.get_user_hash()
+
+    # Prepare filtering parameters
+    cutoff_time = 0
+    if days is not None:
+        cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Explicitly select columns from both tables to avoid ambiguity
+        if abbreviate_response:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name, cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_history_table.c.last_activity_time,
+                cluster_history_table.c.launched_at, cluster_table.c.status,
+                cluster_table.c.workspace)
+        else:
+            query = session.query(
+                cluster_history_table.c.cluster_hash,
+                cluster_history_table.c.name, cluster_history_table.c.num_nodes,
+                cluster_history_table.c.launched_resources,
+                cluster_history_table.c.usage_intervals,
+                cluster_history_table.c.user_hash,
+                cluster_history_table.c.last_creation_yaml,
+                cluster_history_table.c.last_creation_command,
+                cluster_history_table.c.workspace.label('history_workspace'),
+                cluster_history_table.c.last_activity_time,
+                cluster_history_table.c.launched_at, cluster_table.c.status,
+                cluster_table.c.workspace)
+
+        query = query.select_from(
+            cluster_history_table.join(cluster_table,
+                                       cluster_history_table.c.cluster_hash ==
+                                       cluster_table.c.cluster_hash,
+                                       isouter=True))
+
+        # Only include clusters that are either active (status is not None)
+        # or are within the cutoff time (cutoff_time <= last_activity_time).
+        # If days is not specified, we include all clusters by setting
+        # cutoff_time to 0.
+        query = query.filter(
+            (cluster_table.c.status.isnot(None) |
+             (cluster_history_table.c.last_activity_time >= cutoff_time)))
+
+        # Order by launched_at descending (most recent first)
+        query = query.order_by(
+            sqlalchemy.desc(cluster_history_table.c.launched_at))
+
+        if cluster_hashes is not None:
+            query = query.filter(
+                cluster_history_table.c.cluster_hash.in_(cluster_hashes))
+        rows = query.all()
+
+    usage_intervals_dict = {}
+    row_to_user_hash = {}
+    for row in rows:
+        row_usage_intervals: List[Tuple[int, Optional[int]]] = []
+        if row.usage_intervals:
+            try:
+                row_usage_intervals = pickle.loads(row.usage_intervals)
+            except (pickle.PickleError, AttributeError):
+                pass
+        usage_intervals_dict[row.cluster_hash] = row_usage_intervals
+        user_hash = (row.user_hash
+                     if row.user_hash is not None else current_user_hash)
+        row_to_user_hash[row.cluster_hash] = user_hash
+
+    user_hashes = set(row_to_user_hash.values())
+    user_hash_to_user = get_users(user_hashes)
+    cluster_hashes = set(row_to_user_hash.keys())
+    if not abbreviate_response:
+        last_cluster_event_dict = _get_last_cluster_event_multiple(
+            cluster_hashes, ClusterEventType.STATUS_CHANGE)
+
+    records = []
+    for row in rows:
+        user_hash = row_to_user_hash[row.cluster_hash]
+        user = user_hash_to_user.get(user_hash, None)
+        user_name = user.name if user is not None else None
+        if not abbreviate_response:
+            last_event = last_cluster_event_dict.get(row.cluster_hash, None)
+        launched_at = row.launched_at
+        usage_intervals: Optional[List[Tuple[
+            int,
+            Optional[int]]]] = usage_intervals_dict.get(row.cluster_hash, None)
+        duration = _get_cluster_duration(usage_intervals)
+
+        # Parse status
+        status = None
+        if row.status:
+            status = status_lib.ClusterStatus[row.status]
+
+        # Parse launched resources safely
+        launched_resources = None
+        if row.launched_resources:
+            try:
+                launched_resources = pickle.loads(row.launched_resources)
+            except (pickle.PickleError, AttributeError):
+                launched_resources = None
+
+        workspace = (row.history_workspace
+                     if row.history_workspace else row.workspace)
 
         record = {
-            'name': name,
-            'launched_at':
-            'duration':
-            'num_nodes': num_nodes,
-            'resources':
-            'cluster_hash': cluster_hash,
-            'usage_intervals':
+            'name': row.name,
+            'launched_at': launched_at,
+            'duration': duration,
+            'num_nodes': row.num_nodes,
+            'resources': launched_resources,
+            'cluster_hash': row.cluster_hash,
+            'usage_intervals': usage_intervals,
             'status': status,
             'user_hash': user_hash,
+            'user_name': user_name,
+            'workspace': workspace,
         }
+        if not abbreviate_response:
+            record['last_creation_yaml'] = row.last_creation_yaml
+            record['last_creation_command'] = row.last_creation_command
+            record['last_event'] = last_event
 
         records.append(record)
 
     # sort by launch time, descending in recency
-    records = sorted(records, key=lambda record: -record['launched_at'])
+    records = sorted(records, key=lambda record: -(record['launched_at'] or 0))
     return records
 
 
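The query above joins the history table to the live clusters table with a LEFT OUTER JOIN, then keeps rows that are either still active or recent enough. A runnable sketch of that join-plus-filter on a toy two-table schema (table and column names are illustrative):

    import time
    import sqlalchemy
    from sqlalchemy import orm

    engine = sqlalchemy.create_engine('sqlite://')
    metadata = sqlalchemy.MetaData()
    history = sqlalchemy.Table(
        'cluster_history', metadata,
        sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('last_activity_time', sqlalchemy.Integer))
    live = sqlalchemy.Table(
        'clusters', metadata,
        sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('status', sqlalchemy.Text))
    metadata.create_all(engine)

    cutoff = int(time.time()) - 10 * 24 * 60 * 60  # like days=10
    with orm.Session(engine) as session:
        session.execute(history.insert(), [
            {'cluster_hash': 'a', 'last_activity_time': int(time.time())},
            {'cluster_hash': 'b', 'last_activity_time': 0},  # long gone
        ])
        session.execute(live.insert(), [{'cluster_hash': 'b', 'status': 'UP'}])
        session.commit()
        query = session.query(history.c.cluster_hash, live.c.status).select_from(
            history.join(live,
                         history.c.cluster_hash == live.c.cluster_hash,
                         isouter=True))
        query = query.filter((live.c.status.isnot(None)) |
                             (history.c.last_activity_time >= cutoff))
        # 'a' survives because it is recent; 'b' because it is still live.
        print(query.all())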
+@_init_db
+@metrics_lib.time_me
 def get_cluster_names_start_with(starts_with: str) -> List[str]:
-
-
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_table.c.name).filter(
+            cluster_table.c.name.like(f'{starts_with}%')).all()
     return [row[0] for row in rows]
 
 
-
-
-
-
-
+@_init_db
+@metrics_lib.time_me
+def get_cached_enabled_clouds(cloud_capability: 'cloud.CloudCapability',
+                              workspace: str) -> List['clouds.Cloud']:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(config_table).filter_by(
+            key=_get_enabled_clouds_key(cloud_capability, workspace)).first()
     ret = []
-
-    ret = json.loads(value)
-    break
+    if row:
+        ret = json.loads(row.value)
     enabled_clouds: List['clouds.Cloud'] = []
     for c in ret:
         try:
             cloud = registry.CLOUD_REGISTRY.from_str(c)
         except ValueError:
-            # Handle the case for the clouds whose support has been
-            # SkyPilot, e.g., 'local' was a cloud in the past
-            # in the database for users before #3037.
-            # clouds and continue.
+            # Handle the case for the clouds whose support has been
+            # removed from SkyPilot, e.g., 'local' was a cloud in the past
+            # and may be stored in the database for users before #3037.
+            # We should ignore removed clouds and continue.
             continue
         if cloud is not None:
             enabled_clouds.append(cloud)
     return enabled_clouds
 
 
+@_init_db
+@metrics_lib.time_me
 def set_enabled_clouds(enabled_clouds: List[str],
-                       cloud_capability: 'cloud.CloudCapability'
-
-
-
-
-
-
-
-
-
+                       cloud_capability: 'cloud.CloudCapability',
+                       workspace: str) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(config_table).values(
+            key=_get_enabled_clouds_key(cloud_capability, workspace),
+            value=json.dumps(enabled_clouds))
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[config_table.c.key],
+            set_={config_table.c.value: json.dumps(enabled_clouds)})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def _get_enabled_clouds_key(cloud_capability: 'cloud.CloudCapability',
+                            workspace: str) -> str:
+    return _ENABLED_CLOUDS_KEY_PREFIX + workspace + '_' + cloud_capability.value
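set_enabled_clouds (and several functions below) use the same dialect-aware upsert idiom: pick sqlite.insert or postgresql.insert from the engine dialect, then attach on_conflict_do_update so a repeated write overwrites in place. A reduced, runnable SQLite-only sketch (the key name is a toy, not the module's real key scheme):

    import json
    import sqlalchemy
    from sqlalchemy import orm
    from sqlalchemy.dialects import sqlite

    engine = sqlalchemy.create_engine('sqlite://')
    metadata = sqlalchemy.MetaData()
    config_table = sqlalchemy.Table(
        'config', metadata,
        sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('value', sqlalchemy.Text))
    metadata.create_all(engine)

    def set_value(key: str, value) -> None:
        with orm.Session(engine) as session:
            stmt = sqlite.insert(config_table).values(
                key=key, value=json.dumps(value))
            # Insert, or update the existing row on primary-key conflict.
            stmt = stmt.on_conflict_do_update(
                index_elements=[config_table.c.key],
                set_={config_table.c.value: json.dumps(value)})
            session.execute(stmt)
            session.commit()

    set_value('enabled_clouds_demo', ['aws', 'gcp'])
    set_value('enabled_clouds_demo', ['aws'])  # overwrites in place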
+
+
+@_init_db
+@metrics_lib.time_me
+def get_allowed_clouds(workspace: str) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(config_table).filter_by(
+            key=_get_allowed_clouds_key(workspace)).first()
+        if row:
+            return json.loads(row.value)
+    return []
+
+
+@_init_db
+@metrics_lib.time_me
+def set_allowed_clouds(allowed_clouds: List[str], workspace: str) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(config_table).values(
+            key=_get_allowed_clouds_key(workspace),
+            value=json.dumps(allowed_clouds))
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[config_table.c.key],
+            set_={config_table.c.value: json.dumps(allowed_clouds)})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def _get_allowed_clouds_key(workspace: str) -> str:
+    return _ALLOWED_CLOUDS_KEY_PREFIX + workspace
+
+
+@_init_db
+@metrics_lib.time_me
 def add_or_update_storage(storage_name: str,
                           storage_handle: 'Storage.StorageMetadata',
                           storage_status: status_lib.StorageStatus):
+    assert _SQLALCHEMY_ENGINE is not None
     storage_launched_at = int(time.time())
     handle = pickle.dumps(storage_handle)
     last_use = common_utils.get_current_command()

@@ -851,89 +2080,648 @@ def add_or_update_storage(storage_name: str,

     if not status_check(storage_status):
         raise ValueError(f'Error in updating global state. Storage Status '
                          f'{storage_status} is passed in incorrectly')
-
-
-
-
-
-
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(storage_table).values(
+            name=storage_name,
+            handle=handle,
+            last_use=last_use,
+            launched_at=storage_launched_at,
+            status=storage_status.value)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[storage_table.c.name],
+            set_={
+                storage_table.c.handle: handle,
+                storage_table.c.last_use: last_use,
+                storage_table.c.launched_at: storage_launched_at,
+                storage_table.c.status: storage_status.value
+            })
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
 def remove_storage(storage_name: str):
     """Removes Storage from Database"""
-
-
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(storage_table).filter_by(name=storage_name).delete()
+        session.commit()
 
 
+@_init_db
+@metrics_lib.time_me
 def set_storage_status(storage_name: str,
                        status: status_lib.StorageStatus) -> None:
-
-
-
-
-
-    _DB.conn.commit()
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(storage_table).filter_by(
+            name=storage_name).update({storage_table.c.status: status.value})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Storage {storage_name} not found.')
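Query.update() returns the number of rows it matched, which is how set_storage_status distinguishes a successful update from a missing storage row without a separate SELECT. The same check as a self-contained sketch:

    import sqlalchemy
    from sqlalchemy import orm

    engine = sqlalchemy.create_engine('sqlite://')
    metadata = sqlalchemy.MetaData()
    storage_table = sqlalchemy.Table(
        'storage', metadata,
        sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('status', sqlalchemy.Text))
    metadata.create_all(engine)

    def set_status(name: str, status: str) -> None:
        with orm.Session(engine) as session:
            # update() returns the matched-row count.
            count = session.query(storage_table).filter_by(name=name).update(
                {storage_table.c.status: status})
            session.commit()
        assert count <= 1, count  # name is the primary key
        if count == 0:
            raise ValueError(f'Storage {name} not found.')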
 
 
+@_init_db
+@metrics_lib.time_me
 def get_storage_status(storage_name: str) -> Optional[status_lib.StorageStatus]:
+    assert _SQLALCHEMY_ENGINE is not None
     assert storage_name is not None, 'storage_name cannot be None'
-
-
-
-    return status_lib.StorageStatus[status]
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(storage_table).filter_by(name=storage_name).first()
+        if row:
+            return status_lib.StorageStatus[row.status]
     return None
 
 
+@_init_db
+@metrics_lib.time_me
 def set_storage_handle(storage_name: str,
                        handle: 'Storage.StorageMetadata') -> None:
-
-
-
-
-
-
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(storage_table).filter_by(
+            name=storage_name).update(
+                {storage_table.c.handle: pickle.dumps(handle)})
+        session.commit()
     assert count <= 1, count
     if count == 0:
         raise ValueError(f'Storage {storage_name} not found.')
 
 
+@_init_db
+@metrics_lib.time_me
 def get_handle_from_storage_name(
         storage_name: Optional[str]) -> Optional['Storage.StorageMetadata']:
+    assert _SQLALCHEMY_ENGINE is not None
     if storage_name is None:
         return None
-
-
-
-
-        return None
-    return pickle.loads(handle)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(storage_table).filter_by(name=storage_name).first()
+        if row:
+            return pickle.loads(row.handle)
     return None
 
 
+@_init_db
+@metrics_lib.time_me
 def get_glob_storage_name(storage_name: str) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
     assert storage_name is not None, 'storage_name cannot be None'
-
-
-
-
-
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            rows = session.query(storage_table).filter(
+                storage_table.c.name.op('GLOB')(storage_name)).all()
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            rows = session.query(storage_table).filter(
+                storage_table.c.name.op('SIMILAR TO')(
+                    _glob_to_similar(storage_name))).all()
+        else:
+            raise ValueError('Unsupported database dialect')
+        return [row.name for row in rows]
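SQLite evaluates GLOB patterns natively, while PostgreSQL needs them rewritten for SIMILAR TO; that rewrite is what _glob_to_similar (defined elsewhere in this module, not shown in this hunk) provides. A rough illustration of the idea, covering only the common '*' and '?' wildcards, and not the module's actual implementation:

    def glob_to_similar_sketch(glob: str) -> str:
        # Hypothetical helper for illustration only.
        out = []
        for ch in glob:
            if ch == '*':
                out.append('%')  # any sequence of characters
            elif ch == '?':
                out.append('_')  # any single character
            elif ch in '%_|()[]{}+':
                out.append('\\' + ch)  # escape SIMILAR TO metacharacters
            else:
                out.append(ch)
        return ''.join(out)

    assert glob_to_similar_sketch('sky-data-*') == 'sky-data-%'
    assert glob_to_similar_sketch('bucket-?') == 'bucket-_'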
+
+
+@_init_db
+@metrics_lib.time_me
 def get_storage_names_start_with(starts_with: str) -> List[str]:
-
-
-
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(storage_table).filter(
+            storage_table.c.name.like(f'{starts_with}%')).all()
+    return [row.name for row in rows]
 
 
+@_init_db
+@metrics_lib.time_me
 def get_storage() -> List[Dict[str, Any]]:
-
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(storage_table).all()
     records = []
-    for
+    for row in rows:
         # TODO: use namedtuple instead of dict
         records.append({
-            'name': name,
-            'launched_at': launched_at,
-            'handle': pickle.loads(handle),
-            'last_use': last_use,
-            'status': status_lib.StorageStatus[status],
+            'name': row.name,
+            'launched_at': row.launched_at,
+            'handle': pickle.loads(row.handle),
+            'last_use': row.last_use,
+            'status': status_lib.StorageStatus[row.status],
+        })
+    return records
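Storage handles round-trip through pickle: pickle.dumps on write and pickle.loads on every row read back here. A tiny sketch with a stand-in metadata class (note that this ties stored rows to the importability of the real class):

    import pickle

    class FakeStorageMetadata:  # stand-in for Storage.StorageMetadata
        def __init__(self, storage_name: str):
            self.storage_name = storage_name

    blob = pickle.dumps(FakeStorageMetadata('my-bucket'))  # what gets stored
    handle = pickle.loads(blob)  # what get_storage() hands back per row
    assert handle.storage_name == 'my-bucket'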
+
+
+@_init_db
+@metrics_lib.time_me
+def get_volume_names_start_with(starts_with: str) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(volume_table).filter(
+            volume_table.c.name.like(f'{starts_with}%')).all()
+    return [row.name for row in rows]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_volumes() -> List[Dict[str, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(volume_table).all()
+    records = []
+    for row in rows:
+        records.append({
+            'name': row.name,
+            'launched_at': row.launched_at,
+            'handle': pickle.loads(row.handle),
+            'user_hash': row.user_hash,
+            'workspace': row.workspace,
+            'last_attached_at': row.last_attached_at,
+            'last_use': row.last_use,
+            'status': status_lib.VolumeStatus[row.status],
         })
     return records
+
+
+@_init_db
+@metrics_lib.time_me
+def get_volume_by_name(name: str) -> Optional[Dict[str, Any]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(volume_table).filter_by(name=name).first()
+        if row:
+            return {
+                'name': row.name,
+                'launched_at': row.launched_at,
+                'handle': pickle.loads(row.handle),
+                'user_hash': row.user_hash,
+                'workspace': row.workspace,
+                'last_attached_at': row.last_attached_at,
+                'last_use': row.last_use,
+                'status': status_lib.VolumeStatus[row.status],
+            }
+    return None
+
+
+@_init_db
+@metrics_lib.time_me
+def add_volume(name: str, config: models.VolumeConfig,
+               status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    volume_launched_at = int(time.time())
+    handle = pickle.dumps(config)
+    last_use = common_utils.get_current_command()
+    user_hash = common_utils.get_current_user().id
+    active_workspace = skypilot_config.get_active_workspace()
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(volume_table).values(
+            name=name,
+            launched_at=volume_launched_at,
+            handle=handle,
+            user_hash=user_hash,
+            workspace=active_workspace,
+            last_attached_at=None,
+            last_use=last_use,
+            status=status.value,
+        )
+        do_update_stmt = insert_stmnt.on_conflict_do_nothing()
+        session.execute(do_update_stmt)
+        session.commit()
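Unlike the on_conflict_do_update upserts above, add_volume ends with on_conflict_do_nothing: re-adding a volume with an existing name is silently ignored rather than overwritten. A runnable SQLite sketch of that difference:

    import sqlalchemy
    from sqlalchemy import orm
    from sqlalchemy.dialects import sqlite

    engine = sqlalchemy.create_engine('sqlite://')
    metadata = sqlalchemy.MetaData()
    volume_table = sqlalchemy.Table(
        'volumes', metadata,
        sqlalchemy.Column('name', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('status', sqlalchemy.Text))
    metadata.create_all(engine)

    with orm.Session(engine) as session:
        for status in ('READY', 'IN_USE'):  # second insert is a no-op
            stmt = sqlite.insert(volume_table).values(
                name='vol-1', status=status).on_conflict_do_nothing()
            session.execute(stmt)
        session.commit()
        result = session.query(volume_table.c.status).filter(
            volume_table.c.name == 'vol-1').all()
        print(result)  # -> [('READY',)]  (the first write wins)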
+
+
+@_init_db
+@metrics_lib.time_me
+def update_volume(name: str, last_attached_at: int,
+                  status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).update({
+            volume_table.c.last_attached_at: last_attached_at,
+            volume_table.c.status: status.value,
+        })
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def update_volume_status(name: str, status: status_lib.VolumeStatus) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).update({
+            volume_table.c.status: status.value,
+        })
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def delete_volume(name: str) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(volume_table).filter_by(name=name).delete()
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def get_ssh_keys(user_hash: str) -> Tuple[str, str, bool]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(ssh_key_table).filter_by(
+            user_hash=user_hash).first()
+        if row:
+            return row.ssh_public_key, row.ssh_private_key, True
+    return '', '', False
+
+
+@_init_db
+@metrics_lib.time_me
+def set_ssh_keys(user_hash: str, ssh_public_key: str, ssh_private_key: str):
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(ssh_key_table).values(
+            user_hash=user_hash,
+            ssh_public_key=ssh_public_key,
+            ssh_private_key=ssh_private_key)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[ssh_key_table.c.user_hash],
+            set_={
+                ssh_key_table.c.ssh_public_key: ssh_public_key,
+                ssh_key_table.c.ssh_private_key: ssh_private_key
+            })
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def add_service_account_token(token_id: str,
+                              token_name: str,
+                              token_hash: str,
+                              creator_user_hash: str,
+                              service_account_user_id: str,
+                              expires_at: Optional[int] = None) -> None:
+    """Add a service account token to the database."""
+    assert _SQLALCHEMY_ENGINE is not None
+    created_at = int(time.time())
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+
+        insert_stmnt = insert_func(service_account_token_table).values(
+            token_id=token_id,
+            token_name=token_name,
+            token_hash=token_hash,
+            created_at=created_at,
+            expires_at=expires_at,
+            creator_user_hash=creator_user_hash,
+            service_account_user_id=service_account_user_id)
+        session.execute(insert_stmnt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def get_service_account_token(token_id: str) -> Optional[Dict[str, Any]]:
+    """Get a service account token by token_id."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(service_account_token_table).filter_by(
+            token_id=token_id).first()
+    if row is None:
+        return None
+    return {
+        'token_id': row.token_id,
+        'token_name': row.token_name,
+        'token_hash': row.token_hash,
+        'created_at': row.created_at,
+        'last_used_at': row.last_used_at,
+        'expires_at': row.expires_at,
+        'creator_user_hash': row.creator_user_hash,
+        'service_account_user_id': row.service_account_user_id,
+    }
+
+
+@_init_db
+@metrics_lib.time_me
+def get_user_service_account_tokens(user_hash: str) -> List[Dict[str, Any]]:
+    """Get all service account tokens for a user (as creator)."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(service_account_token_table).filter_by(
+            creator_user_hash=user_hash).all()
+    return [{
+        'token_id': row.token_id,
+        'token_name': row.token_name,
+        'token_hash': row.token_hash,
+        'created_at': row.created_at,
+        'last_used_at': row.last_used_at,
+        'expires_at': row.expires_at,
+        'creator_user_hash': row.creator_user_hash,
+        'service_account_user_id': row.service_account_user_id,
+    } for row in rows]
+
+
+@_init_db
+@metrics_lib.time_me
+def update_service_account_token_last_used(token_id: str) -> None:
+    """Update the last_used_at timestamp for a service account token."""
+    assert _SQLALCHEMY_ENGINE is not None
+    last_used_at = int(time.time())
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(service_account_token_table).filter_by(
+            token_id=token_id).update(
+                {service_account_token_table.c.last_used_at: last_used_at})
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def delete_service_account_token(token_id: str) -> bool:
+    """Delete a service account token.
+
+    Returns:
+        True if token was found and deleted.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        result = session.query(service_account_token_table).filter_by(
+            token_id=token_id).delete()
+        session.commit()
+        return result > 0
+
+
+@_init_db
+@metrics_lib.time_me
+def rotate_service_account_token(token_id: str,
+                                 new_token_hash: str,
+                                 new_expires_at: Optional[int] = None) -> None:
+    """Rotate a service account token by updating its hash and expiration.
+
+    Args:
+        token_id: The token ID to rotate.
+        new_token_hash: The new hashed token value.
+        new_expires_at: New expiration timestamp, or None for no expiration.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    current_time = int(time.time())
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        count = session.query(service_account_token_table).filter_by(
+            token_id=token_id
+        ).update({
+            service_account_token_table.c.token_hash: new_token_hash,
+            service_account_token_table.c.expires_at: new_expires_at,
+            service_account_token_table.c.last_used_at: None,  # Reset last used
+            # Update creation time
+            service_account_token_table.c.created_at: current_time,
+        })
+        session.commit()
+
+    if count == 0:
+        raise ValueError(f'Service account token {token_id} not found.')
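rotate_service_account_token only stores the new hash and resets the usage metadata; generating the secret and hashing it happens at the call site. A hypothetical caller might look like the following (the token format and SHA-256 hashing here are assumptions for illustration, not necessarily SkyPilot's actual scheme):

    import hashlib
    import secrets
    import time

    def make_token_pair():
        # Assumed scheme: random URL-safe secret, SHA-256 hash stored in the DB.
        plaintext = secrets.token_urlsafe(32)  # shown to the user exactly once
        token_hash = hashlib.sha256(plaintext.encode()).hexdigest()
        return plaintext, token_hash

    plaintext, token_hash = make_token_pair()
    new_expires_at = int(time.time()) + 30 * 24 * 60 * 60  # 30 days out
    # rotate_service_account_token('sa-token-123', token_hash, new_expires_at)
    print(f'new secret (show once): {plaintext}')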
+
+
+@_init_db
+@metrics_lib.time_me
+def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
+    """Get the cluster yaml from the database or the local file system.
+    If the cluster yaml is not in the database, check if it exists on the
+    local file system and migrate it to the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    if cluster_yaml_path is None:
+        raise ValueError('Attempted to read a None YAML.')
+    cluster_file_name = os.path.basename(cluster_yaml_path)
+    cluster_name, _ = os.path.splitext(cluster_file_name)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).first()
+    if row is None:
+        return _set_cluster_yaml_from_file(cluster_yaml_path, cluster_name)
+    return row.yaml
+
+
+def get_cluster_yaml_str_multiple(cluster_yaml_paths: List[str]) -> List[str]:
+    """Get the cluster yaml from the database or the local file system.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    cluster_names_to_yaml_paths = {}
+    for cluster_yaml_path in cluster_yaml_paths:
+        cluster_name, _ = os.path.splitext(os.path.basename(cluster_yaml_path))
+        cluster_names_to_yaml_paths[cluster_name] = cluster_yaml_path
+
+    cluster_names = list(cluster_names_to_yaml_paths.keys())
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(cluster_yaml_table).filter(
+            cluster_yaml_table.c.cluster_name.in_(cluster_names)).all()
+    row_cluster_names_to_yaml = {row.cluster_name: row.yaml for row in rows}
+
+    yaml_strs = []
+    for cluster_name in cluster_names:
+        if cluster_name in row_cluster_names_to_yaml:
+            yaml_strs.append(row_cluster_names_to_yaml[cluster_name])
+        else:
+            yaml_str = _set_cluster_yaml_from_file(
+                cluster_names_to_yaml_paths[cluster_name], cluster_name)
+            yaml_strs.append(yaml_str)
+    return yaml_strs
+
+
+def _set_cluster_yaml_from_file(cluster_yaml_path: str,
+                                cluster_name: str) -> Optional[str]:
+    """Set the cluster yaml in the database from a file."""
+    # If the cluster yaml is not in the database, check if it exists
+    # on the local file system and migrate it to the database.
+    # TODO(syang): remove this check once we have a way to migrate the
+    # cluster from file to database. Remove on v0.12.0.
+    if cluster_yaml_path is not None:
+        # First try the exact path
+        path_to_read = None
+        if os.path.exists(cluster_yaml_path):
+            path_to_read = cluster_yaml_path
+        # Fallback: try with .debug suffix (when debug logging was enabled)
+        # Debug logging causes YAML files to be saved with .debug suffix
+        # but the path stored in the handle doesn't include it
+        debug_path = cluster_yaml_path + '.debug'
+        if os.path.exists(debug_path):
+            path_to_read = debug_path
+        if path_to_read is not None:
+            with open(path_to_read, 'r', encoding='utf-8') as f:
+                yaml_str = f.read()
+            set_cluster_yaml(cluster_name, yaml_str)
+            return yaml_str
+    return None
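Together, get_cluster_yaml_str and _set_cluster_yaml_from_file implement a read-through migration: serve the YAML from the database when the row exists, otherwise read the legacy file (preferring a '.debug' copy when present) and write it back so later reads skip the filesystem. The shape of that pattern with plain dict and file stand-ins:

    import os
    from typing import Optional

    db = {}  # stands in for cluster_yaml_table

    def get_yaml(path: str) -> Optional[str]:
        name, _ = os.path.splitext(os.path.basename(path))
        if name in db:
            return db[name]  # fast path: already migrated
        path_to_read = path if os.path.exists(path) else None
        if os.path.exists(path + '.debug'):
            path_to_read = path + '.debug'  # debug copy takes precedence
        if path_to_read is None:
            return None
        with open(path_to_read, 'r', encoding='utf-8') as f:
            yaml_str = f.read()
        db[name] = yaml_str  # migrate: the next read skips the file
        return yaml_str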
+
+
+def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
+    """Get the cluster yaml as a dictionary from the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    yaml_str = get_cluster_yaml_str(cluster_yaml_path)
+    if yaml_str is None:
+        raise ValueError(f'Cluster yaml {cluster_yaml_path} not found.')
+    return yaml_utils.safe_load(yaml_str)
+
+
+def get_cluster_yaml_dict_multiple(
+        cluster_yaml_paths: List[str]) -> List[Dict[str, Any]]:
+    """Get the cluster yaml as a dictionary from the database."""
+    yaml_strs = get_cluster_yaml_str_multiple(cluster_yaml_paths)
+    yaml_dicts = []
+    for idx, yaml_str in enumerate(yaml_strs):
+        if yaml_str is None:
+            raise ValueError(
+                f'Cluster yaml {cluster_yaml_paths[idx]} not found.')
+        yaml_dicts.append(yaml_utils.safe_load(yaml_str))
+    return yaml_dicts
+
+
+@_init_db
+@metrics_lib.time_me
+def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
+    """Set the cluster yaml in the database."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(cluster_yaml_table).values(
+            cluster_name=cluster_name, yaml=yaml_str)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[cluster_yaml_table.c.cluster_name],
+            set_={cluster_yaml_table.c.yaml: yaml_str})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def remove_cluster_yaml(cluster_name: str):
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).delete()
+        session.commit()
+
+
+@_init_db
+@metrics_lib.time_me
+def get_all_service_account_tokens() -> List[Dict[str, Any]]:
+    """Get all service account tokens across all users (for admin access)."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(service_account_token_table).all()
+    return [{
+        'token_id': row.token_id,
+        'token_name': row.token_name,
+        'token_hash': row.token_hash,
+        'created_at': row.created_at,
+        'last_used_at': row.last_used_at,
+        'expires_at': row.expires_at,
+        'creator_user_hash': row.creator_user_hash,
+        'service_account_user_id': row.service_account_user_id,
+    } for row in rows]
+
+
+@_init_db
+@metrics_lib.time_me
+def get_system_config(config_key: str) -> Optional[str]:
+    """Get a system configuration value by key."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(system_config_table).filter_by(
+            config_key=config_key).first()
+    if row is None:
+        return None
+    return row.config_value
+
+
+@_init_db
+@metrics_lib.time_me
+def set_system_config(config_key: str, config_value: str) -> None:
+    """Set a system configuration value."""
+    assert _SQLALCHEMY_ENGINE is not None
+    current_time = int(time.time())
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+
+        insert_stmnt = insert_func(system_config_table).values(
+            config_key=config_key,
+            config_value=config_value,
+            created_at=current_time,
+            updated_at=current_time)
+
+        upsert_stmnt = insert_stmnt.on_conflict_do_update(
+            index_elements=[system_config_table.c.config_key],
+            set_={
+                system_config_table.c.config_value: config_value,
+                system_config_table.c.updated_at: current_time,
+            })
+        session.execute(upsert_stmnt)
+        session.commit()
+
+
+@_init_db
+def get_max_db_connections() -> Optional[int]:
+    """Get the maximum number of connections for the engine."""
+    assert _SQLALCHEMY_ENGINE is not None
+    if (_SQLALCHEMY_ENGINE.dialect.name ==
+            db_utils.SQLAlchemyDialect.SQLITE.value):
+        return None
+    with sqlalchemy.orm.Session(_SQLALCHEMY_ENGINE) as session:
+        max_connections = session.execute(
+            sqlalchemy.text('SHOW max_connections')).scalar()
+    if max_connections is None:
+        return None
+    return int(max_connections)
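SHOW max_connections is PostgreSQL-specific, which is why the function returns None up front on SQLite. A standalone version of the same probe that works against any SQLAlchemy engine:

    import sqlalchemy

    def max_db_connections(engine: sqlalchemy.engine.Engine):
        if engine.dialect.name == 'sqlite':
            return None  # SQLite has no server-side connection limit to report
        with engine.connect() as conn:
            value = conn.execute(
                sqlalchemy.text('SHOW max_connections')).scalar()
        return int(value) if value is not None else None

    print(max_db_connections(sqlalchemy.create_engine('sqlite://')))  # -> None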