dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +80 -0
- dstack/_internal/cli/commands/apply.py +100 -0
- dstack/_internal/cli/commands/attach.py +161 -0
- dstack/_internal/cli/commands/completion.py +22 -0
- dstack/_internal/cli/commands/delete.py +44 -0
- dstack/_internal/cli/commands/event.py +168 -0
- dstack/_internal/cli/commands/fleet.py +161 -0
- dstack/_internal/cli/commands/gateway.py +159 -0
- dstack/_internal/cli/commands/init.py +64 -0
- dstack/_internal/cli/commands/login.py +352 -0
- dstack/_internal/cli/commands/logs.py +62 -0
- dstack/_internal/cli/commands/metrics.py +153 -0
- dstack/_internal/cli/commands/offer.py +146 -0
- dstack/_internal/cli/commands/project.py +259 -0
- dstack/_internal/cli/commands/ps.py +81 -0
- dstack/_internal/cli/commands/run.py +69 -0
- dstack/_internal/cli/commands/secrets.py +92 -0
- dstack/_internal/cli/commands/server.py +96 -0
- dstack/_internal/cli/commands/stop.py +26 -0
- dstack/_internal/cli/commands/volume.py +117 -0
- dstack/_internal/cli/main.py +101 -0
- dstack/_internal/cli/models/gateways.py +16 -0
- dstack/_internal/cli/models/offers.py +47 -0
- dstack/_internal/cli/models/runs.py +16 -0
- dstack/_internal/cli/services/args.py +31 -0
- dstack/_internal/cli/services/completion.py +91 -0
- dstack/_internal/cli/services/configurators/__init__.py +86 -0
- dstack/_internal/cli/services/configurators/base.py +103 -0
- dstack/_internal/cli/services/configurators/fleet.py +475 -0
- dstack/_internal/cli/services/configurators/gateway.py +231 -0
- dstack/_internal/cli/services/configurators/run.py +882 -0
- dstack/_internal/cli/services/configurators/volume.py +222 -0
- dstack/_internal/cli/services/events.py +68 -0
- dstack/_internal/cli/services/profile.py +182 -0
- dstack/_internal/cli/services/repos.py +71 -0
- dstack/_internal/cli/services/resources.py +54 -0
- dstack/_internal/cli/utils/common.py +159 -0
- dstack/_internal/cli/utils/fleet.py +106 -0
- dstack/_internal/cli/utils/gateway.py +56 -0
- dstack/_internal/cli/utils/gpu.py +178 -0
- dstack/_internal/cli/utils/rich.py +156 -0
- dstack/_internal/cli/utils/run.py +517 -0
- dstack/_internal/cli/utils/secrets.py +25 -0
- dstack/_internal/cli/utils/updates.py +98 -0
- dstack/_internal/cli/utils/volume.py +58 -0
- dstack/_internal/compat.py +3 -0
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/auth.py +30 -0
- dstack/_internal/core/backends/aws/backend.py +31 -0
- dstack/_internal/core/backends/aws/compute.py +1153 -0
- dstack/_internal/core/backends/aws/configurator.py +191 -0
- dstack/_internal/core/backends/aws/models.py +135 -0
- dstack/_internal/core/backends/aws/resources.py +700 -0
- dstack/_internal/core/backends/azure/auth.py +39 -0
- dstack/_internal/core/backends/azure/backend.py +21 -0
- dstack/_internal/core/backends/azure/compute.py +676 -0
- dstack/_internal/core/backends/azure/configurator.py +472 -0
- dstack/_internal/core/backends/azure/models.py +98 -0
- dstack/_internal/core/backends/azure/resources.py +116 -0
- dstack/_internal/core/backends/azure/utils.py +42 -0
- dstack/_internal/core/backends/base/backend.py +18 -0
- dstack/_internal/core/backends/base/compute.py +1101 -0
- dstack/_internal/core/backends/base/configurator.py +117 -0
- dstack/_internal/core/backends/base/models.py +24 -0
- dstack/_internal/core/backends/base/offers.py +232 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +181 -0
- dstack/_internal/core/backends/cudo/__init__.py +0 -0
- dstack/_internal/core/backends/cudo/api_client.py +111 -0
- dstack/_internal/core/backends/cudo/backend.py +16 -0
- dstack/_internal/core/backends/cudo/compute.py +174 -0
- dstack/_internal/core/backends/cudo/configurator.py +63 -0
- dstack/_internal/core/backends/cudo/models.py +37 -0
- dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
- dstack/_internal/core/backends/datacrunch/backend.py +18 -0
- dstack/_internal/core/backends/datacrunch/compute.py +8 -0
- dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/dstack/__init__.py +0 -0
- dstack/_internal/core/backends/dstack/models.py +26 -0
- dstack/_internal/core/backends/features.py +74 -0
- dstack/_internal/core/backends/gcp/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/auth.py +57 -0
- dstack/_internal/core/backends/gcp/backend.py +17 -0
- dstack/_internal/core/backends/gcp/compute.py +1257 -0
- dstack/_internal/core/backends/gcp/configurator.py +206 -0
- dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
- dstack/_internal/core/backends/gcp/models.py +160 -0
- dstack/_internal/core/backends/gcp/resources.py +585 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +188 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
- dstack/_internal/core/backends/kubernetes/backend.py +16 -0
- dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
- dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
- dstack/_internal/core/backends/kubernetes/models.py +71 -0
- dstack/_internal/core/backends/kubernetes/utils.py +81 -0
- dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
- dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
- dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
- dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
- dstack/_internal/core/backends/lambdalabs/models.py +37 -0
- dstack/_internal/core/backends/local/__init__.py +0 -0
- dstack/_internal/core/backends/local/backend.py +14 -0
- dstack/_internal/core/backends/local/compute.py +130 -0
- dstack/_internal/core/backends/models.py +158 -0
- dstack/_internal/core/backends/nebius/__init__.py +0 -0
- dstack/_internal/core/backends/nebius/backend.py +16 -0
- dstack/_internal/core/backends/nebius/compute.py +401 -0
- dstack/_internal/core/backends/nebius/configurator.py +98 -0
- dstack/_internal/core/backends/nebius/models.py +185 -0
- dstack/_internal/core/backends/nebius/resources.py +433 -0
- dstack/_internal/core/backends/oci/__init__.py +0 -0
- dstack/_internal/core/backends/oci/auth.py +21 -0
- dstack/_internal/core/backends/oci/backend.py +16 -0
- dstack/_internal/core/backends/oci/compute.py +209 -0
- dstack/_internal/core/backends/oci/configurator.py +156 -0
- dstack/_internal/core/backends/oci/exceptions.py +15 -0
- dstack/_internal/core/backends/oci/models.py +87 -0
- dstack/_internal/core/backends/oci/region.py +86 -0
- dstack/_internal/core/backends/oci/resources.py +836 -0
- dstack/_internal/core/backends/runpod/__init__.py +0 -0
- dstack/_internal/core/backends/runpod/api_client.py +627 -0
- dstack/_internal/core/backends/runpod/backend.py +16 -0
- dstack/_internal/core/backends/runpod/compute.py +444 -0
- dstack/_internal/core/backends/runpod/configurator.py +63 -0
- dstack/_internal/core/backends/runpod/models.py +54 -0
- dstack/_internal/core/backends/template/__init__.py +0 -0
- dstack/_internal/core/backends/template/backend.py.jinja +16 -0
- dstack/_internal/core/backends/template/compute.py.jinja +95 -0
- dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
- dstack/_internal/core/backends/template/models.py.jinja +62 -0
- dstack/_internal/core/backends/tensordock/models.py +40 -0
- dstack/_internal/core/backends/vastai/__init__.py +0 -0
- dstack/_internal/core/backends/vastai/api_client.py +143 -0
- dstack/_internal/core/backends/vastai/backend.py +16 -0
- dstack/_internal/core/backends/vastai/compute.py +141 -0
- dstack/_internal/core/backends/vastai/configurator.py +69 -0
- dstack/_internal/core/backends/vastai/models.py +37 -0
- dstack/_internal/core/backends/verda/__init__.py +0 -0
- dstack/_internal/core/backends/verda/backend.py +16 -0
- dstack/_internal/core/backends/verda/compute.py +266 -0
- dstack/_internal/core/backends/verda/configurator.py +73 -0
- dstack/_internal/core/backends/verda/models.py +38 -0
- dstack/_internal/core/backends/vultr/__init__.py +0 -0
- dstack/_internal/core/backends/vultr/api_client.py +116 -0
- dstack/_internal/core/backends/vultr/backend.py +16 -0
- dstack/_internal/core/backends/vultr/compute.py +167 -0
- dstack/_internal/core/backends/vultr/configurator.py +71 -0
- dstack/_internal/core/backends/vultr/models.py +34 -0
- dstack/_internal/core/compatibility/__init__.py +0 -0
- dstack/_internal/core/compatibility/events.py +13 -0
- dstack/_internal/core/compatibility/fleets.py +58 -0
- dstack/_internal/core/compatibility/gateways.py +39 -0
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/logs.py +14 -0
- dstack/_internal/core/compatibility/runs.py +86 -0
- dstack/_internal/core/compatibility/volumes.py +37 -0
- dstack/_internal/core/consts.py +8 -0
- dstack/_internal/core/errors.py +160 -0
- dstack/_internal/core/models/__init__.py +0 -0
- dstack/_internal/core/models/auth.py +28 -0
- dstack/_internal/core/models/backends/__init__.py +0 -0
- dstack/_internal/core/models/backends/base.py +48 -0
- dstack/_internal/core/models/common.py +143 -0
- dstack/_internal/core/models/compute_groups.py +39 -0
- dstack/_internal/core/models/config.py +28 -0
- dstack/_internal/core/models/configurations.py +1123 -0
- dstack/_internal/core/models/envs.py +149 -0
- dstack/_internal/core/models/events.py +98 -0
- dstack/_internal/core/models/files.py +67 -0
- dstack/_internal/core/models/fleets.py +437 -0
- dstack/_internal/core/models/gateways.py +146 -0
- dstack/_internal/core/models/gpus.py +45 -0
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +346 -0
- dstack/_internal/core/models/logs.py +27 -0
- dstack/_internal/core/models/metrics.py +14 -0
- dstack/_internal/core/models/placement.py +27 -0
- dstack/_internal/core/models/profiles.py +431 -0
- dstack/_internal/core/models/projects.py +46 -0
- dstack/_internal/core/models/repos/__init__.py +34 -0
- dstack/_internal/core/models/repos/base.py +36 -0
- dstack/_internal/core/models/repos/local.py +96 -0
- dstack/_internal/core/models/repos/remote.py +341 -0
- dstack/_internal/core/models/repos/virtual.py +85 -0
- dstack/_internal/core/models/resources.py +424 -0
- dstack/_internal/core/models/routers.py +24 -0
- dstack/_internal/core/models/runs.py +618 -0
- dstack/_internal/core/models/secrets.py +16 -0
- dstack/_internal/core/models/server.py +7 -0
- dstack/_internal/core/models/services.py +76 -0
- dstack/_internal/core/models/unix.py +53 -0
- dstack/_internal/core/models/users.py +60 -0
- dstack/_internal/core/models/volumes.py +221 -0
- dstack/_internal/core/services/__init__.py +16 -0
- dstack/_internal/core/services/api_client.py +15 -0
- dstack/_internal/core/services/configs/__init__.py +116 -0
- dstack/_internal/core/services/diff.py +71 -0
- dstack/_internal/core/services/logs.py +58 -0
- dstack/_internal/core/services/profiles.py +46 -0
- dstack/_internal/core/services/repos.py +236 -0
- dstack/_internal/core/services/ssh/__init__.py +27 -0
- dstack/_internal/core/services/ssh/attach.py +241 -0
- dstack/_internal/core/services/ssh/client.py +113 -0
- dstack/_internal/core/services/ssh/key_manager.py +53 -0
- dstack/_internal/core/services/ssh/ports.py +89 -0
- dstack/_internal/core/services/ssh/tunnel.py +337 -0
- dstack/_internal/proxy/__init__.py +8 -0
- dstack/_internal/proxy/gateway/__init__.py +0 -0
- dstack/_internal/proxy/gateway/app.py +89 -0
- dstack/_internal/proxy/gateway/auth.py +26 -0
- dstack/_internal/proxy/gateway/const.py +7 -0
- dstack/_internal/proxy/gateway/deps.py +73 -0
- dstack/_internal/proxy/gateway/main.py +17 -0
- dstack/_internal/proxy/gateway/models.py +23 -0
- dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
- dstack/_internal/proxy/gateway/repo/repo.py +121 -0
- dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
- dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
- dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
- dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
- dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
- dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
- dstack/_internal/proxy/gateway/routers/auth.py +10 -0
- dstack/_internal/proxy/gateway/routers/config.py +28 -0
- dstack/_internal/proxy/gateway/routers/registry.py +124 -0
- dstack/_internal/proxy/gateway/routers/stats.py +18 -0
- dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
- dstack/_internal/proxy/gateway/schemas/common.py +5 -0
- dstack/_internal/proxy/gateway/schemas/config.py +9 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
- dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
- dstack/_internal/proxy/gateway/services/__init__.py +0 -0
- dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
- dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
- dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
- dstack/_internal/proxy/gateway/services/nginx.py +455 -0
- dstack/_internal/proxy/gateway/services/registry.py +426 -0
- dstack/_internal/proxy/gateway/services/server_client.py +95 -0
- dstack/_internal/proxy/gateway/services/stats.py +170 -0
- dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
- dstack/_internal/proxy/gateway/testing/common.py +13 -0
- dstack/_internal/proxy/lib/__init__.py +0 -0
- dstack/_internal/proxy/lib/auth.py +7 -0
- dstack/_internal/proxy/lib/deps.py +106 -0
- dstack/_internal/proxy/lib/errors.py +14 -0
- dstack/_internal/proxy/lib/models.py +112 -0
- dstack/_internal/proxy/lib/repo.py +27 -0
- dstack/_internal/proxy/lib/routers/__init__.py +0 -0
- dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
- dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
- dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
- dstack/_internal/proxy/lib/services/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
- dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
- dstack/_internal/proxy/lib/services/service_connection.py +160 -0
- dstack/_internal/proxy/lib/testing/__init__.py +0 -0
- dstack/_internal/proxy/lib/testing/auth.py +11 -0
- dstack/_internal/proxy/lib/testing/common.py +51 -0
- dstack/_internal/server/__init__.py +0 -0
- dstack/_internal/server/alembic.ini +100 -0
- dstack/_internal/server/app.py +432 -0
- dstack/_internal/server/background/__init__.py +142 -0
- dstack/_internal/server/background/tasks/__init__.py +0 -0
- dstack/_internal/server/background/tasks/common.py +24 -0
- dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
- dstack/_internal/server/background/tasks/process_events.py +17 -0
- dstack/_internal/server/background/tasks/process_fleets.py +289 -0
- dstack/_internal/server/background/tasks/process_gateways.py +188 -0
- dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
- dstack/_internal/server/background/tasks/process_instances.py +1186 -0
- dstack/_internal/server/background/tasks/process_metrics.py +172 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
- dstack/_internal/server/background/tasks/process_runs.py +842 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
- dstack/_internal/server/background/tasks/process_volumes.py +129 -0
- dstack/_internal/server/compatibility/__init__.py +0 -0
- dstack/_internal/server/compatibility/common.py +20 -0
- dstack/_internal/server/compatibility/gpus.py +22 -0
- dstack/_internal/server/db.py +127 -0
- dstack/_internal/server/deps.py +19 -0
- dstack/_internal/server/main.py +4 -0
- dstack/_internal/server/migrations/__init__.py +0 -0
- dstack/_internal/server/migrations/env.py +112 -0
- dstack/_internal/server/migrations/script.py.mako +28 -0
- dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
- dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
- dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
- dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
- dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
- dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
- dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
- dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
- dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
- dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
- dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
- dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
- dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
- dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
- dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
- dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
- dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
- dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
- dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
- dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
- dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
- dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
- dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
- dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
- dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
- dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
- dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
- dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
- dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
- dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
- dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
- dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
- dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
- dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
- dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
- dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
- dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
- dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
- dstack/_internal/server/migrations/versions/__init__.py +0 -0
- dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
- dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
- dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
- dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
- dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
- dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
- dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
- dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
- dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
- dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
- dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
- dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
- dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
- dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
- dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
- dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
- dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
- dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
- dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
- dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
- dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
- dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
- dstack/_internal/server/models.py +930 -0
- dstack/_internal/server/routers/__init__.py +0 -0
- dstack/_internal/server/routers/auth.py +34 -0
- dstack/_internal/server/routers/backends.py +142 -0
- dstack/_internal/server/routers/events.py +60 -0
- dstack/_internal/server/routers/files.py +68 -0
- dstack/_internal/server/routers/fleets.py +202 -0
- dstack/_internal/server/routers/gateways.py +109 -0
- dstack/_internal/server/routers/gpus.py +32 -0
- dstack/_internal/server/routers/instances.py +77 -0
- dstack/_internal/server/routers/logs.py +34 -0
- dstack/_internal/server/routers/metrics.py +82 -0
- dstack/_internal/server/routers/projects.py +205 -0
- dstack/_internal/server/routers/prometheus.py +35 -0
- dstack/_internal/server/routers/repos.py +118 -0
- dstack/_internal/server/routers/runs.py +216 -0
- dstack/_internal/server/routers/secrets.py +86 -0
- dstack/_internal/server/routers/server.py +19 -0
- dstack/_internal/server/routers/users.py +158 -0
- dstack/_internal/server/routers/volumes.py +122 -0
- dstack/_internal/server/schemas/__init__.py +0 -0
- dstack/_internal/server/schemas/auth.py +83 -0
- dstack/_internal/server/schemas/backends.py +16 -0
- dstack/_internal/server/schemas/common.py +9 -0
- dstack/_internal/server/schemas/events.py +211 -0
- dstack/_internal/server/schemas/files.py +5 -0
- dstack/_internal/server/schemas/fleets.py +49 -0
- dstack/_internal/server/schemas/gateways.py +31 -0
- dstack/_internal/server/schemas/gpus.py +26 -0
- dstack/_internal/server/schemas/health/__init__.py +0 -0
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +47 -0
- dstack/_internal/server/schemas/logs.py +17 -0
- dstack/_internal/server/schemas/projects.py +81 -0
- dstack/_internal/server/schemas/repos.py +24 -0
- dstack/_internal/server/schemas/runner.py +269 -0
- dstack/_internal/server/schemas/runs.py +66 -0
- dstack/_internal/server/schemas/secrets.py +16 -0
- dstack/_internal/server/schemas/users.py +72 -0
- dstack/_internal/server/schemas/volumes.py +29 -0
- dstack/_internal/server/security/__init__.py +0 -0
- dstack/_internal/server/security/permissions.py +251 -0
- dstack/_internal/server/services/__init__.py +0 -0
- dstack/_internal/server/services/auth.py +77 -0
- dstack/_internal/server/services/backends/__init__.py +404 -0
- dstack/_internal/server/services/backends/handlers.py +105 -0
- dstack/_internal/server/services/compute_groups.py +22 -0
- dstack/_internal/server/services/config.py +279 -0
- dstack/_internal/server/services/docker.py +162 -0
- dstack/_internal/server/services/encryption/__init__.py +102 -0
- dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
- dstack/_internal/server/services/encryption/keys/aes.py +68 -0
- dstack/_internal/server/services/encryption/keys/base.py +19 -0
- dstack/_internal/server/services/encryption/keys/identity.py +28 -0
- dstack/_internal/server/services/events.py +477 -0
- dstack/_internal/server/services/files.py +91 -0
- dstack/_internal/server/services/fleets.py +1224 -0
- dstack/_internal/server/services/gateways/__init__.py +686 -0
- dstack/_internal/server/services/gateways/client.py +209 -0
- dstack/_internal/server/services/gateways/connection.py +139 -0
- dstack/_internal/server/services/gateways/pool.py +58 -0
- dstack/_internal/server/services/gpus.py +387 -0
- dstack/_internal/server/services/instances.py +731 -0
- dstack/_internal/server/services/jobs/__init__.py +840 -0
- dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
- dstack/_internal/server/services/jobs/configurators/base.py +469 -0
- dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
- dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
- dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
- dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
- dstack/_internal/server/services/jobs/configurators/service.py +28 -0
- dstack/_internal/server/services/jobs/configurators/task.py +39 -0
- dstack/_internal/server/services/locking.py +187 -0
- dstack/_internal/server/services/logging.py +29 -0
- dstack/_internal/server/services/logs/__init__.py +122 -0
- dstack/_internal/server/services/logs/aws.py +373 -0
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +261 -0
- dstack/_internal/server/services/logs/fluentbit.py +329 -0
- dstack/_internal/server/services/logs/gcp.py +181 -0
- dstack/_internal/server/services/metrics.py +172 -0
- dstack/_internal/server/services/offers.py +249 -0
- dstack/_internal/server/services/permissions.py +37 -0
- dstack/_internal/server/services/placement.py +234 -0
- dstack/_internal/server/services/plugins.py +109 -0
- dstack/_internal/server/services/probes.py +10 -0
- dstack/_internal/server/services/projects.py +835 -0
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
- dstack/_internal/server/services/proxy/__init__.py +3 -0
- dstack/_internal/server/services/proxy/auth.py +12 -0
- dstack/_internal/server/services/proxy/deps.py +18 -0
- dstack/_internal/server/services/proxy/repo.py +189 -0
- dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
- dstack/_internal/server/services/proxy/services/__init__.py +0 -0
- dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
- dstack/_internal/server/services/repos.py +362 -0
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +260 -0
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/__init__.py +0 -0
- dstack/_internal/server/services/runner/client.py +646 -0
- dstack/_internal/server/services/runner/ssh.py +128 -0
- dstack/_internal/server/services/runs/__init__.py +1026 -0
- dstack/_internal/server/services/runs/plan.py +703 -0
- dstack/_internal/server/services/runs/replicas.py +317 -0
- dstack/_internal/server/services/runs/spec.py +191 -0
- dstack/_internal/server/services/secrets.py +245 -0
- dstack/_internal/server/services/services/__init__.py +345 -0
- dstack/_internal/server/services/services/autoscalers.py +140 -0
- dstack/_internal/server/services/services/options.py +53 -0
- dstack/_internal/server/services/ssh.py +67 -0
- dstack/_internal/server/services/storage/__init__.py +37 -0
- dstack/_internal/server/services/storage/base.py +48 -0
- dstack/_internal/server/services/storage/gcs.py +66 -0
- dstack/_internal/server/services/storage/s3.py +69 -0
- dstack/_internal/server/services/users.py +461 -0
- dstack/_internal/server/services/volumes.py +496 -0
- dstack/_internal/server/settings.py +161 -0
- dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
- dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
- dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
- dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
- dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
- dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
- dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
- dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
- dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
- dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
- dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
- dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
- dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
- dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
- dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
- dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
- dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
- dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
- dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
- dstack/_internal/server/statics/assets/favicon.ico +0 -0
- dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
- dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
- dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
- dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
- dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
- dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
- dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
- dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
- dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
- dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
- dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
- dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
- dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
- dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
- dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
- dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
- dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
- dstack/_internal/server/statics/index.html +3 -0
- dstack/_internal/server/statics/logo-notext.svg +116 -0
- dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
- dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
- dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
- dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
- dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
- dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
- dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
- dstack/_internal/server/testing/__init__.py +0 -0
- dstack/_internal/server/testing/common.py +1220 -0
- dstack/_internal/server/testing/conf.py +53 -0
- dstack/_internal/server/testing/matchers.py +31 -0
- dstack/_internal/server/utils/__init__.py +0 -0
- dstack/_internal/server/utils/common.py +55 -0
- dstack/_internal/server/utils/logging.py +51 -0
- dstack/_internal/server/utils/provisioning.py +368 -0
- dstack/_internal/server/utils/routers.py +166 -0
- dstack/_internal/server/utils/sentry_utils.py +24 -0
- dstack/_internal/settings.py +49 -0
- dstack/_internal/utils/__init__.py +0 -0
- dstack/_internal/utils/common.py +318 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/_internal/utils/crypto.py +40 -0
- dstack/_internal/utils/env.py +88 -0
- dstack/_internal/utils/event_loop.py +30 -0
- dstack/_internal/utils/files.py +69 -0
- dstack/_internal/utils/gpu.py +59 -0
- dstack/_internal/utils/hash.py +31 -0
- dstack/_internal/utils/interpolator.py +91 -0
- dstack/_internal/utils/json_schema.py +11 -0
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/_internal/utils/logging.py +5 -0
- dstack/_internal/utils/nested_list.py +47 -0
- dstack/_internal/utils/network.py +50 -0
- dstack/_internal/utils/path.py +57 -0
- dstack/_internal/utils/random_names.py +258 -0
- dstack/_internal/utils/ssh.py +346 -0
- dstack/_internal/utils/tags.py +42 -0
- dstack/_internal/utils/typing.py +14 -0
- dstack/_internal/utils/version.py +22 -0
- dstack/api/__init__.py +46 -0
- dstack/api/_public/__init__.py +96 -0
- dstack/api/_public/backends.py +42 -0
- dstack/api/_public/common.py +5 -0
- dstack/api/_public/repos.py +202 -0
- dstack/api/_public/runs.py +714 -0
- dstack/api/server/__init__.py +206 -0
- dstack/api/server/_auth.py +30 -0
- dstack/api/server/_backends.py +38 -0
- dstack/api/server/_events.py +64 -0
- dstack/api/server/_files.py +18 -0
- dstack/api/server/_fleets.py +82 -0
- dstack/api/server/_gateways.py +54 -0
- dstack/api/server/_gpus.py +27 -0
- dstack/api/server/_group.py +22 -0
- dstack/api/server/_logs.py +15 -0
- dstack/api/server/_metrics.py +23 -0
- dstack/api/server/_projects.py +124 -0
- dstack/api/server/_repos.py +64 -0
- dstack/api/server/_runs.py +102 -0
- dstack/api/server/_secrets.py +36 -0
- dstack/api/server/_users.py +82 -0
- dstack/api/server/_volumes.py +39 -0
- dstack/api/server/utils.py +34 -0
- dstack/api/utils.py +105 -0
- dstack/core/__init__.py +0 -0
- dstack/plugins/__init__.py +8 -0
- dstack/plugins/_base.py +72 -0
- dstack/plugins/_models.py +8 -0
- dstack/plugins/_utils.py +19 -0
- dstack/plugins/builtin/__init__.py +0 -0
- dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
- dstack/plugins/builtin/rest_plugin/_models.py +48 -0
- dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
- dstack/version.py +3 -1
- dstack-0.20.7.dist-info/METADATA +519 -0
- dstack-0.20.7.dist-info/RECORD +720 -0
- {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
- dstack-0.20.7.dist-info/entry_points.txt +2 -0
- dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
- dstack/aws/__init__.py +0 -180
- dstack/aws/artifacts.py +0 -111
- dstack/aws/config.py +0 -40
- dstack/aws/jobs.py +0 -245
- dstack/aws/logs.py +0 -186
- dstack/aws/repos.py +0 -137
- dstack/aws/run_names.py +0 -17
- dstack/aws/runners.py +0 -693
- dstack/aws/runs.py +0 -79
- dstack/aws/secrets.py +0 -99
- dstack/aws/tags.py +0 -138
- dstack/backend.py +0 -299
- dstack/cli/app.py +0 -41
- dstack/cli/artifacts.py +0 -87
- dstack/cli/common.py +0 -57
- dstack/cli/config.py +0 -194
- dstack/cli/dashboard.py +0 -26
- dstack/cli/delete.py +0 -49
- dstack/cli/init.py +0 -33
- dstack/cli/logs.py +0 -87
- dstack/cli/main.py +0 -81
- dstack/cli/restart.py +0 -43
- dstack/cli/run.py +0 -223
- dstack/cli/schema.py +0 -46
- dstack/cli/secrets.py +0 -97
- dstack/cli/status.py +0 -140
- dstack/cli/stop.py +0 -53
- dstack/cli/tags.py +0 -100
- dstack/config.py +0 -80
- dstack/dashboard/artifacts.py +0 -26
- dstack/dashboard/logs.py +0 -73
- dstack/dashboard/main.py +0 -45
- dstack/dashboard/repos.py +0 -41
- dstack/dashboard/runs.py +0 -140
- dstack/dashboard/secrets.py +0 -53
- dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
- dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
- dstack/dashboard/statics/assets/browserconfig.xml +0 -15
- dstack/dashboard/statics/assets/coast-228x228.png +0 -0
- dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
- dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
- dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
- dstack/dashboard/statics/assets/favicon.ico +0 -0
- dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
- dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
- dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
- dstack/dashboard/statics/assets/manifest.webapp +0 -14
- dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
- dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
- dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
- dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
- dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
- dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
- dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
- dstack/dashboard/statics/index.html +0 -7
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
- dstack/dashboard/statics/main.css +0 -5058
- dstack/dashboard/statics/splash_thumbnail.png +0 -0
- dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
- dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
- dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
- dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
- dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
- dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
- dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
- dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
- dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
- dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
- dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
- dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
- dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
- dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
- dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
- dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
- dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
- dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
- dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
- dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
- dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
- dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
- dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
- dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
- dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
- dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
- dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
- dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
- dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
- dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
- dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
- dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
- dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
- dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
- dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
- dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
- dstack/dashboard/tags.py +0 -119
- dstack/jobs.py +0 -255
- dstack/providers/__init__.py +0 -316
- dstack/providers/_python/main.py +0 -88
- dstack/providers/_tensorboard/main.py +0 -93
- dstack/providers/_torchrun/main.py +0 -121
- dstack/providers/bash/main.py +0 -90
- dstack/providers/code/main.py +0 -95
- dstack/providers/docker/main.py +0 -79
- dstack/providers/lab/main.py +0 -95
- dstack/providers/notebook/main.py +0 -90
- dstack/random_name.py +0 -29
- dstack/repo.py +0 -135
- dstack/runners.py +0 -35
- dstack/util.py +0 -15
- dstack-0.0.9.dist-info/METADATA +0 -176
- dstack-0.0.9.dist-info/RECORD +0 -179
- dstack-0.0.9.dist-info/entry_points.txt +0 -3
- dstack-0.0.9.dist-info/top_level.txt +0 -2
- tests/test_config.py +0 -70
- /dstack/{cli → _internal}/__init__.py +0 -0
- /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
- /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
- /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
- /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
- /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
- /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
- /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
- /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
- /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
- {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
- /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
- /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
|
@@ -0,0 +1,840 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import json
|
|
3
|
+
from datetime import timedelta
|
|
4
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
5
|
+
from uuid import UUID
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
from sqlalchemy import select
|
|
9
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
+
from sqlalchemy.orm import joinedload, load_only
|
|
11
|
+
|
|
12
|
+
import dstack._internal.server.services.backends as backends_services
|
|
13
|
+
from dstack._internal.core.backends.base.backend import Backend
|
|
14
|
+
from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
|
|
15
|
+
from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT
|
|
16
|
+
from dstack._internal.core.errors import (
|
|
17
|
+
BackendError,
|
|
18
|
+
ResourceNotExistsError,
|
|
19
|
+
ServerClientError,
|
|
20
|
+
SSHError,
|
|
21
|
+
)
|
|
22
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
23
|
+
from dstack._internal.core.models.configurations import RunConfigurationType
|
|
24
|
+
from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason
|
|
25
|
+
from dstack._internal.core.models.runs import (
|
|
26
|
+
Job,
|
|
27
|
+
JobProvisioningData,
|
|
28
|
+
JobRuntimeData,
|
|
29
|
+
JobSpec,
|
|
30
|
+
JobStatus,
|
|
31
|
+
JobSubmission,
|
|
32
|
+
JobTerminationReason,
|
|
33
|
+
RunSpec,
|
|
34
|
+
)
|
|
35
|
+
from dstack._internal.core.models.volumes import Volume, VolumeMountPoint, VolumeStatus
|
|
36
|
+
from dstack._internal.server import settings
|
|
37
|
+
from dstack._internal.server.models import (
|
|
38
|
+
InstanceModel,
|
|
39
|
+
JobModel,
|
|
40
|
+
ProjectModel,
|
|
41
|
+
RunModel,
|
|
42
|
+
VolumeModel,
|
|
43
|
+
)
|
|
44
|
+
from dstack._internal.server.services import events, services
|
|
45
|
+
from dstack._internal.server.services import volumes as volumes_services
|
|
46
|
+
from dstack._internal.server.services.instances import (
|
|
47
|
+
format_instance_blocks_for_event,
|
|
48
|
+
get_instance_ssh_private_keys,
|
|
49
|
+
switch_instance_status,
|
|
50
|
+
)
|
|
51
|
+
from dstack._internal.server.services.jobs.configurators.base import (
|
|
52
|
+
JobConfigurator,
|
|
53
|
+
interpolate_job_volumes,
|
|
54
|
+
)
|
|
55
|
+
from dstack._internal.server.services.jobs.configurators.dev import DevEnvironmentJobConfigurator
|
|
56
|
+
from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator
|
|
57
|
+
from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator
|
|
58
|
+
from dstack._internal.server.services.logging import fmt
|
|
59
|
+
from dstack._internal.server.services.probes import probe_model_to_probe
|
|
60
|
+
from dstack._internal.server.services.runner import client
|
|
61
|
+
from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
|
|
62
|
+
from dstack._internal.server.services.volumes import (
|
|
63
|
+
list_project_volume_models,
|
|
64
|
+
volume_model_to_volume,
|
|
65
|
+
)
|
|
66
|
+
from dstack._internal.utils import common
|
|
67
|
+
from dstack._internal.utils.common import get_or_error, run_async
|
|
68
|
+
from dstack._internal.utils.logging import get_logger
|
|
69
|
+
|
|
70
|
+
# Module-level logger, named after this module via `__name__`.
logger = get_logger(__name__)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def switch_job_status(
    session: AsyncSession,
    job_model: JobModel,
    new_status: JobStatus,
    actor: events.AnyActor = events.SystemActor(),
):
    """
    Switch job status.

    **NOTE**: When switching to `TERMINATING`, set `termination_reason` and preferably
    `termination_reason_message` before calling this function.

    Does nothing when the job is already in `new_status`. Otherwise updates
    `job_model.status` and emits an event describing the transition, attributed
    to `actor` and targeted at `job_model`.

    Raises:
        ValueError: If switching to `TERMINATING` while `job_model.termination_reason`
            is not set. The check runs before `job_model` is modified.
    """
    old_status = job_model.status
    if old_status == new_status:
        return

    # Validate before mutating, so that a rejected transition does not leave
    # the model in TERMINATING status without a termination reason.
    if new_status == JobStatus.TERMINATING and job_model.termination_reason is None:
        raise ValueError("termination_reason must be set when switching to TERMINATING status")

    job_model.status = new_status

    msg = f"Job status changed {old_status.upper()} -> {new_status.upper()}"
    if new_status == JobStatus.TERMINATING:
        msg += f". Termination reason: {job_model.termination_reason.upper()}"
        if job_model.termination_reason_message:
            msg += f" ({job_model.termination_reason_message})"
    events.emit(session, msg, actor=actor, targets=[events.Target.from_model(job_model)])
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def get_jobs_from_run_spec(
    run_spec: RunSpec,
    secrets: Dict[str, str],
    replica_num: int,
    replica_group_name: Optional[str] = None,
) -> List[Job]:
    """
    Build the jobs for one replica of a run.

    Obtains the job specs from `get_job_specs_from_run_spec` (called with the
    same arguments) and wraps each spec in a `Job` with an empty list of
    submissions.
    """
    job_specs = await get_job_specs_from_run_spec(
        run_spec=run_spec,
        secrets=secrets,
        replica_num=replica_num,
        replica_group_name=replica_group_name,
    )
    return [Job(job_spec=job_spec, job_submissions=[]) for job_spec in job_specs]
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
async def get_job_specs_from_run_spec(
    run_spec: RunSpec,
    secrets: Dict[str, str],
    replica_num: int,
    replica_group_name: Optional[str] = None,
) -> List[JobSpec]:
    """
    Return the job specs for one replica of a run.

    Obtains a job configurator from `_get_job_configurator` for the given run
    spec, secrets and replica group, then asks it for the specs of replica
    `replica_num`.
    """
    configurator = _get_job_configurator(
        run_spec=run_spec,
        secrets=secrets,
        replica_group_name=replica_group_name,
    )
    return await configurator.get_job_specs(replica_num=replica_num)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def find_job(jobs: List[Job], replica_num: int, job_num: int) -> Job:
    """
    Return the first job whose spec has the given `replica_num` and `job_num`.

    Raises:
        ResourceNotExistsError: If no job in `jobs` matches both numbers.
    """
    found = next(
        (
            candidate
            for candidate in jobs
            if candidate.job_spec.replica_num == replica_num
            and candidate.job_spec.job_num == job_num
        ),
        None,
    )
    if found is None:
        raise ResourceNotExistsError(
            f"Job with replica_num={replica_num} and job_num={job_num} not found"
        )
    return found
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def find_jobs(
    jobs: List[Job],
    replica_num: Optional[int] = None,
    job_num: Optional[int] = None,
) -> list[Job]:
    """
    Filter `jobs` by replica number and/or job number.

    A filter that is `None` is not applied. When both are `None`, the input
    list itself is returned (not a copy). Otherwise a new list is returned
    that keeps the original order.
    """
    if replica_num is None and job_num is None:
        return jobs

    def _matches(job) -> bool:
        # Check the replica first; the job number is only examined for jobs
        # that already passed the replica filter.
        spec = job.job_spec
        return (replica_num is None or spec.replica_num == replica_num) and (
            job_num is None or spec.job_num == job_num
        )

    return [job for job in jobs if _matches(job)]
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
async def get_run_job_model(
    session: AsyncSession,
    project: ProjectModel,
    run_name: str,
    run_id: Optional[UUID],
    replica_num: int,
    job_num: int,
) -> Optional[JobModel]:
    """
    Load the latest submission of a job within a run.

    Jobs are matched by the run's project and name plus `replica_num` and
    `job_num`. If `run_id` is given, the run must also have that id; otherwise
    only non-deleted runs are considered.

    Returns:
        The matching `JobModel` with the highest `submission_num`, or `None`
        if nothing matches.
    """
    if run_id is not None:
        run_filter = RunModel.id == run_id
    else:
        # Assuming run_name is unique for non-deleted runs
        run_filter = RunModel.deleted == False
    query = (
        select(JobModel)
        .join(JobModel.run)
        .where(
            RunModel.project_id == project.id,
            RunModel.run_name == run_name,
            JobModel.replica_num == replica_num,
            JobModel.job_num == job_num,
            run_filter,
        )
        .order_by(JobModel.submission_num.desc())
        .limit(1)
    )
    result = await session.execute(query)
    return result.scalar_one_or_none()
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def job_model_to_job_submission(
    job_model: JobModel, include_probes: bool = False
) -> JobSubmission:
    """
    Builds a `JobSubmission` from `job_model`.

    `finished_at` is set to `last_processed_at` only if the job status is finished.
    Probes are converted and included only if `include_probes` is set;
    otherwise `probes` is an empty list.
    """
    jpd = get_job_provisioning_data(job_model)
    if jpd is not None:
        # TODO remove after transitioning to computed fields
        resources = jpd.instance_type.resources
        resources.description = resources.pretty_format()
        # TODO do we really still need this magic? See https://github.com/dstackai/dstack/pull/1682
        # i.e., replacing `jpd.backend` with `jpd.get_base_backend()` should give the same result
        if jpd.backend == BackendType.DSTACK and jpd.backend_data is not None:
            jpd.backend = json.loads(jpd.backend_data)["base_backend"]

    last_processed_at = job_model.last_processed_at
    finished_at = last_processed_at if job_model.status.is_finished() else None
    status_message = _get_job_status_message(job_model)
    error = _get_job_error(job_model)
    probes = [probe_model_to_probe(pm) for pm in job_model.probes] if include_probes else []
    termination_reason = (
        job_model.termination_reason.value if job_model.termination_reason else None
    )
    return JobSubmission(
        id=job_model.id,
        submission_num=job_model.submission_num,
        deployment_num=job_model.deployment_num,
        submitted_at=job_model.submitted_at,
        last_processed_at=last_processed_at,
        finished_at=finished_at,
        inactivity_secs=job_model.inactivity_secs,
        status=job_model.status,
        status_message=status_message,
        termination_reason=termination_reason,
        termination_reason_message=job_model.termination_reason_message,
        exit_status=job_model.exit_status,
        job_provisioning_data=jpd,
        job_runtime_data=get_job_runtime_data(job_model),
        error=error,
        probes=probes,
    )
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def get_job_provisioning_data(job_model: JobModel) -> Optional[JobProvisioningData]:
    """
    Parses `job_model.job_provisioning_data` into `JobProvisioningData`.
    Returns `None` if the job has no provisioning data stored.
    """
    raw = job_model.job_provisioning_data
    if raw is not None:
        return JobProvisioningData.__response__.parse_raw(raw)
    return None
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def get_job_runtime_data(job_model: JobModel) -> Optional[JobRuntimeData]:
    """
    Parses `job_model.job_runtime_data` into `JobRuntimeData`.
    Returns `None` if the job has no runtime data stored.
    """
    raw = job_model.job_runtime_data
    if raw is not None:
        return JobRuntimeData.__response__.parse_raw(raw)
    return None
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def delay_job_instance_termination(job_model: JobModel):
    """
    Sets `job_model.remove_at` to 15 seconds after the current time.
    """
    delay = timedelta(seconds=15)
    job_model.remove_at = common.get_current_datetime() + delay
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def is_multinode_job(job: Job) -> bool:
    """
    Tells whether the job's spec declares more than one job per replica.
    """
    jobs_per_replica = job.job_spec.jobs_per_replica
    return jobs_per_replica > 1
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def is_master_job(job: Job) -> bool:
    """
    Tells whether the job is the one with `job_num` 0.
    """
    job_num = job.job_spec.job_num
    return job_num == 0
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _get_job_configurator(
    run_spec: RunSpec, secrets: Dict[str, str], replica_group_name: Optional[str] = None
) -> JobConfigurator:
    """
    Instantiates the job configurator class registered for the run's configuration type.
    """
    configurator_class = _configuration_type_to_configurator_class_map[
        RunConfigurationType(run_spec.configuration.type)
    ]
    return configurator_class(
        run_spec=run_spec,
        secrets=secrets,
        replica_group_name=replica_group_name,
    )
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
# Job configurator classes known to `_get_job_configurator`.
_job_configurator_classes = [
    DevEnvironmentJobConfigurator,
    TaskJobConfigurator,
    ServiceJobConfigurator,
]

# Lookup of a configurator class by its `TYPE` attribute.
_configuration_type_to_configurator_class_map = {
    configurator_class.TYPE: configurator_class
    for configurator_class in _job_configurator_classes
}
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
async def stop_runner(session: AsyncSession, job_model: JobModel):
    """
    Asks the runner of the job to stop via `_stop_runner`.

    Loads the job's instance (with its project) to obtain SSH private keys.
    Does nothing if the job has no provisioning data.
    `SSHError` is logged at debug level and not propagated.
    """
    query = (
        select(InstanceModel)
        .where(
            InstanceModel.project_id == job_model.project_id,
            InstanceModel.id == job_model.instance_id,
        )
        .options(joinedload(InstanceModel.project))
    )
    result = await session.execute(query)
    instance: Optional[InstanceModel] = result.scalar()

    ssh_private_keys = get_instance_ssh_private_keys(common.get_or_error(instance))
    try:
        jpd = get_job_provisioning_data(job_model)
        if jpd is None:
            return
        jrd = get_job_runtime_data(job_model)
        await run_async(_stop_runner, ssh_private_keys, jpd, jrd, job_model)
    except SSHError:
        logger.debug("%s: failed to stop runner", fmt(job_model))
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
@runner_ssh_tunnel(ports=[DSTACK_RUNNER_HTTP_PORT])
def _stop_runner(
    ports: dict[int, int],
    job_model: JobModel,
):
    """
    Sends a stop request to the runner through the port mapped for `DSTACK_RUNNER_HTTP_PORT`.
    `requests.RequestException` raised by the stop request is logged and not propagated.
    """
    logger.debug("%s: stopping runner", fmt(job_model))
    runner_port = ports[DSTACK_RUNNER_HTTP_PORT]
    runner = client.RunnerClient(port=runner_port)
    try:
        runner.stop()
    except requests.RequestException:
        logger.exception("%s: failed to stop runner gracefully", fmt(job_model))
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
async def process_terminating_job(
    session: AsyncSession,
    job_model: JobModel,
    instance_model: Optional[InstanceModel],
):
    """
    Stops the job: tells shim to stop the container, detaches the job from the instance,
    and detaches volumes from the instance.
    Graceful stop should already be done by `process_terminating_run`.
    Caller must acquire the locks on the job and the job's instance.

    Does nothing while `job_model.remove_at` is set and still in the future.
    If the job has an instance, the final job status is set only when
    all volumes are reported detached.
    """
    remove_at = job_model.remove_at
    if remove_at is not None and remove_at > common.get_current_datetime():
        # it's too early to terminate the instance
        return

    if instance_model is None:
        # Possible if the job hasn't been assigned an instance yet
        await services.unregister_replica(session, job_model)
        _set_job_termination_status(session, job_model)
        return

    all_volumes_detached: bool = True
    jrd = get_job_runtime_data(job_model)
    jpd = get_job_provisioning_data(job_model)
    if jpd is not None:
        logger.debug("%s: stopping container", fmt(job_model))
        ssh_private_keys = get_instance_ssh_private_keys(instance_model)
        container_stopped = await stop_container(job_model, jpd, ssh_private_keys)
        if not container_stopped:
            # The dangling container can be removed later during instance processing
            logger.warning(
                (
                    "%s: could not stop container, possibly due to a communication error."
                    " See debug logs for details."
                    " Ignoring, can attempt to remove the container later"
                ),
                fmt(job_model),
            )
        if jrd is None or jrd.volume_names is None:
            # Legacy jobs before job_runtime_data/blocks were introduced
            volume_names = [va.volume.name for va in instance_model.volume_attachments]
        else:
            volume_names = jrd.volume_names
        volume_models = await list_project_volume_models(
            session=session, project=instance_model.project, names=volume_names
        )
        if volume_models:
            logger.info("Detaching volumes: %s", [v.name for v in volume_models])
            all_volumes_detached = await _detach_volumes_from_job_instance(
                project=instance_model.project,
                job_model=job_model,
                jpd=jpd,
                instance_model=instance_model,
                volume_models=volume_models,
            )

    # Old jobs submitted before jrd or blocks were introduced occupy one block
    blocks = jrd.offer.blocks if jrd is not None and jrd.offer is not None else 1
    instance_model.busy_blocks -= blocks

    if instance_model.status != InstanceStatus.BUSY or jpd is None or not jpd.dockerized:
        # Terminate instances that:
        # - have not finished provisioning yet
        # - belong to container-based backends, and hence cannot be reused
        if instance_model.status not in InstanceStatus.finished_statuses():
            instance_model.termination_reason = InstanceTerminationReason.JOB_FINISHED
            switch_instance_status(session, instance_model, InstanceStatus.TERMINATING)
    elif all(j.id == job_model.id for j in instance_model.jobs):
        # no other jobs besides this one
        switch_instance_status(session, instance_model, InstanceStatus.IDLE)

    # The instance should be released even if detach fails
    # so that stuck volumes don't prevent the instance from terminating.
    job_model.instance_id = None
    instance_model.last_job_processed_at = common.get_current_datetime()

    events.emit(
        session,
        (
            "Job unassigned from instance."
            f" Instance blocks: {format_instance_blocks_for_event(instance_model)}"
        ),
        actor=events.SystemActor(),
        targets=[
            events.Target.from_model(job_model),
            events.Target.from_model(instance_model),
        ],
    )

    # Record on the job's volumes that a job has just been processed
    if jrd and jrd.volume_names:
        volume_names = jrd.volume_names
    else:
        volume_names = [va.volume.name for va in instance_model.volume_attachments]
    if volume_names:
        volumes = await list_project_volume_models(
            session=session, project=instance_model.project, names=volume_names
        )
        for volume in volumes:
            volume.last_job_processed_at = common.get_current_datetime()

    await services.unregister_replica(session, job_model)
    if all_volumes_detached:
        # Do not terminate while some volumes are not detached.
        _set_job_termination_status(session, job_model)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
async def process_volumes_detaching(
    session: AsyncSession,
    job_model: JobModel,
    instance_model: InstanceModel,
):
    """
    Called after job's volumes have been soft detached to check if they are detached.
    Terminates the job when all the volumes are detached.
    If the volumes fail to detach, force detaches them.

    Raises if the job has no provisioning data (via `get_or_error`).
    """
    jpd = get_or_error(get_job_provisioning_data(job_model))
    jrd = get_job_runtime_data(job_model)
    if jrd is None or jrd.volume_names is None:
        # Legacy jobs before job_runtime_data/blocks were introduced
        volume_names = [va.volume.name for va in instance_model.volume_attachments]
    else:
        volume_names = jrd.volume_names
    volume_models = await list_project_volume_models(
        session=session, project=instance_model.project, names=volume_names
    )
    logger.info("Detaching volumes: %s", [v.name for v in volume_models])
    all_volumes_detached = await _detach_volumes_from_job_instance(
        project=instance_model.project,
        job_model=job_model,
        jpd=jpd,
        instance_model=instance_model,
        volume_models=volume_models,
    )
    if not all_volumes_detached:
        # Do not terminate the job while some volumes are not detached.
        # If force detach never succeeds, the job will be stuck terminating.
        # The job releases the instance when soft detaching, so the instance won't be stuck.
        return
    _set_job_termination_status(session, job_model)
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def _set_job_termination_status(session: AsyncSession, job_model: JobModel):
    """
    Switches the job to the status derived from its termination reason,
    or to `JobStatus.FAILED` if no termination reason is set.
    """
    reason = job_model.termination_reason
    final_status = reason.to_status() if reason is not None else JobStatus.FAILED
    switch_job_status(session, job_model, final_status)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
async def stop_container(
    job_model: JobModel,
    job_provisioning_data: JobProvisioningData,
    ssh_private_keys: tuple[str, Optional[str]],
) -> bool:
    """
    Asks the shim to terminate the job's container if the job is dockerized.

    Returns:
        `True` if the job is not dockerized; otherwise the result of `_shim_submit_stop`.
    """
    if not job_provisioning_data.dockerized:
        return True
    # send a request to the shim to terminate the docker container
    # SSHError and RequestException are caught in the `runner_ssh_tunnel` decorator
    stopped = await run_async(
        _shim_submit_stop,
        ssh_private_keys,
        job_provisioning_data,
        None,
        job_model,
    )
    return stopped
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT])
def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel) -> bool:
    """
    Tells the shim to stop the job's container.

    Returns:
        `False` if the shim healthcheck returns nothing, `True` otherwise.
    """
    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])

    if shim_client.healthcheck() is None:
        logger.debug("%s: can't stop container, shim is not available yet", fmt(job_model))
        return False  # shim is not available yet

    # we force-kill container because the runner had time to gracefully stop the job
    if not shim_client.is_api_v2_supported():
        shim_client.stop(force=True)
        return True

    termination_reason = job_model.termination_reason
    reason = None if termination_reason is None else termination_reason.value
    shim_client.terminate_task(
        task_id=job_model.id,
        reason=reason,
        message=job_model.termination_reason_message,
        timeout=0,
    )
    # maybe somehow postpone removing old tasks to allow inspecting failed jobs without
    # the following setting?
    if not settings.SERVER_KEEP_SHIM_TASKS:
        shim_client.remove_task(task_id=job_model.id)
    return True
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def group_jobs_by_replica_latest(jobs: List[JobModel]) -> Iterable[Tuple[int, List[JobModel]]]:
    """
    Args:
        jobs: unsorted list of jobs

    Yields:
        latest jobs in each replica (replica_num, jobs),
        in ascending order of `replica_num`, with jobs ordered by `job_num`
    """

    def _submission_order(job):
        return (job.replica_num, job.job_num, job.submission_num)

    ordered = sorted(jobs, key=_submission_order)
    for replica_num, replica_submissions in itertools.groupby(
        ordered, key=lambda job: job.replica_num
    ):
        # Submissions arrive in ascending `submission_num` order for each `job_num`,
        # so a later submission overwrites an earlier one and only the latest is kept.
        # The latest `submission_num` doesn't have to be the same for all jobs.
        latest_by_job_num = {}
        for submission in replica_submissions:
            latest_by_job_num[submission.job_num] = submission
        yield replica_num, list(latest_by_job_num.values())
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
async def _detach_volumes_from_job_instance(
    project: ProjectModel,
    job_model: JobModel,
    jpd: JobProvisioningData,
    instance_model: InstanceModel,
    volume_models: list[VolumeModel],
) -> bool:
    """
    Tries to detach each of `volume_models` from the job's instance.

    Sets `job_model.volumes_detached_at` if it is not set yet and removes the attachments
    of the volumes reported detached from `instance_model.volume_attachments`.

    Returns:
        `True` if every volume is reported detached.
        `False` if some volume is not, or if the backend is not available.
    """
    job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data)
    backend = await backends_services.get_project_backend_by_type(
        project=project,
        backend_type=jpd.backend,
    )
    if backend is None:
        logger.error(
            "Failed to detach volumes from %s. Backend not available.", instance_model.name
        )
        return False

    detached_volumes_ids = set()
    all_detached = True
    for volume_model in volume_models:
        is_detached = await _detach_volume_from_job_instance(
            backend=backend,
            job_model=job_model,
            jpd=jpd,
            job_spec=job_spec,
            instance_model=instance_model,
            volume_model=volume_model,
        )
        if is_detached:
            detached_volumes_ids.add(volume_model.id)
        else:
            all_detached = False

    # Must be set only after the loop: `_detach_volume_from_job_instance` tries soft detach
    # only while `volumes_detached_at` is not set.
    if job_model.volumes_detached_at is None:
        job_model.volumes_detached_at = common.get_current_datetime()
    instance_model.volume_attachments = [
        va for va in instance_model.volume_attachments if va.volume_id not in detached_volumes_ids
    ]
    return all_detached
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
async def _detach_volume_from_job_instance(
    backend: Backend,
    job_model: JobModel,
    jpd: JobProvisioningData,
    job_spec: JobSpec,
    instance_model: InstanceModel,
    volume_model: VolumeModel,
) -> bool:
    """
    Detaches a single volume from the job's instance, or checks if it is detached.

    If `job_model.volumes_detached_at` is not set, a soft detach is requested first.
    Otherwise only the detach state is checked, and a force detach is requested
    if `_should_force_detach_volume` allows it.

    Returns:
        `True` if the volume is considered detached, `False` otherwise.
    """
    volume = volume_model_to_volume(volume_model)
    volume_provisioning_data = volume.provisioning_data
    if volume_provisioning_data is None or not volume_provisioning_data.detachable:
        # Backends without `detach_volume` detach volumes automatically
        return True
    compute = backend.compute()
    assert isinstance(compute, ComputeWithVolumeSupport)
    # NOTE(review): if a call below raises before `detached` is reassigned,
    # the volume is reported as detached. Confirm this is intended.
    detached = True
    try:
        first_attempt = job_model.volumes_detached_at is None
        if first_attempt:
            # We haven't tried detaching volumes yet, try soft detach first
            await run_async(
                compute.detach_volume,
                volume=volume,
                provisioning_data=jpd,
                force=False,
            )
        # For some backends, the volume may be detached immediately
        detached = await run_async(
            compute.is_volume_detached,
            volume=volume,
            provisioning_data=jpd,
        )
        if (
            not first_attempt
            and not detached
            and _should_force_detach_volume(job_model, job_spec.stop_duration)
        ):
            logger.info(
                "Force detaching volume %s from %s",
                volume_model.name,
                instance_model.name,
            )
            await run_async(
                compute.detach_volume,
                volume=volume,
                provisioning_data=jpd,
                force=True,
            )
            # Let the next iteration check if force detach worked
    except BackendError as e:
        logger.error(
            "Failed to detach volume %s from %s: %s",
            volume_model.name,
            instance_model.name,
            repr(e),
        )
    except Exception:
        logger.exception(
            "Got exception when detaching volume %s from instance %s",
            volume_model.name,
            instance_model.name,
        )
    return detached
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
# Minimum time that must pass after `volumes_detached_at` before a force detach is allowed.
MIN_FORCE_DETACH_WAIT_PERIOD = timedelta(minutes=1)
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def _should_force_detach_volume(job_model: JobModel, stop_duration: Optional[int]) -> bool:
    """
    Tells whether the job's volumes should be force detached.

    Requires that `volumes_detached_at` is set and more than
    `MIN_FORCE_DETACH_WAIT_PERIOD` has passed since then.
    Additionally, either the job was aborted by the user,
    or `stop_duration` is given and that many seconds have passed since `volumes_detached_at`.
    """
    detached_at = job_model.volumes_detached_at
    if detached_at is None:
        return False
    if common.get_current_datetime() <= detached_at + MIN_FORCE_DETACH_WAIT_PERIOD:
        return False
    if job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER:
        return True
    if stop_duration is None:
        return False
    return common.get_current_datetime() > detached_at + timedelta(seconds=stop_duration)
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
async def get_instances_ids_with_detaching_volumes(session: AsyncSession) -> List[UUID]:
    """
    Returns `used_instance_id` of every terminating job
    that has both `used_instance_id` and `volumes_detached_at` set.
    """
    query = (
        select(JobModel)
        .where(
            JobModel.status == JobStatus.TERMINATING,
            JobModel.used_instance_id.is_not(None),
            JobModel.volumes_detached_at.is_not(None),
        )
        .options(load_only(JobModel.used_instance_id))
    )
    result = await session.execute(query)
    instance_ids = []
    for job_model in result.scalars().all():
        if job_model.used_instance_id:
            instance_ids.append(job_model.used_instance_id)
    return instance_ids
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
async def get_job_configured_volumes(
    session: AsyncSession,
    project: ProjectModel,
    run_spec: RunSpec,
    job_num: int,
    job_spec: Optional[JobSpec] = None,
) -> List[List[Volume]]:
    """
    Returns a list of job volumes grouped by mount points.

    The volume models come from `get_job_configured_volume_models`
    and each one is converted to a `Volume`.
    """
    grouped_volume_models = await get_job_configured_volume_models(
        session=session,
        project=project,
        run_spec=run_spec,
        job_num=job_num,
        job_spec=job_spec,
    )
    return [
        list(map(volumes_services.volume_model_to_volume, mount_point_volume_models))
        for mount_point_volume_models in grouped_volume_models
    ]
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
async def get_job_configured_volume_models(
    session: AsyncSession,
    project: ProjectModel,
    run_spec: RunSpec,
    job_num: int,
    job_spec: Optional[JobSpec] = None,
) -> List[List[VolumeModel]]:
    """
    Returns a list of job volume models grouped by mount points.

    The volumes are taken from `job_spec.volumes` if available;
    otherwise they are interpolated from the run configuration for `job_num`.
    Mount points that are not `VolumeMountPoint` are skipped.

    Raises:
        ResourceNotExistsError: if a volume named by a mount point is not found in the project.
    """
    job_volumes = job_spec.volumes if job_spec is not None else None
    if job_volumes is None:
        # job_spec not provided or a legacy job_spec without volumes
        job_volumes = interpolate_job_volumes(run_spec.configuration.volumes, job_num)
    volume_models = []
    for mount_point in job_volumes:
        if not isinstance(mount_point, VolumeMountPoint):
            continue
        # A mount point names either a single volume or several candidate volumes
        names = [mount_point.name] if isinstance(mount_point.name, str) else mount_point.name
        mount_point_volume_models = []
        for name in names:
            volume_model = await volumes_services.get_project_volume_model_by_name(
                session=session,
                project=project,
                name=name,
            )
            if volume_model is None:
                # Report the specific missing volume, not the whole list of candidate names
                raise ResourceNotExistsError(f"Volume {name} not found")
            mount_point_volume_models.append(volume_model)
        volume_models.append(mount_point_volume_models)
    return volume_models
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def check_can_attach_job_volumes(volumes: List[List[Volume]]):
    """
    Performs basic checks if volumes can be attached.
    This is useful to show error ASAP (when user submits the run).
    If the attachment is to fail anyway, the error will be handled when processing submitted jobs.

    Raises:
        ServerClientError: if mount points use different sets of backends or regions,
            if a volume is not active, or if the same volume is used more than once.
    """
    if not volumes:
        return
    # Every mount point must use the same backends and regions as the first one
    reference_volumes = volumes[0]
    expected_backends = {v.configuration.backend for v in reference_volumes}
    expected_regions = {v.configuration.region for v in reference_volumes}
    for mount_point_volumes in volumes:
        if {v.configuration.backend for v in mount_point_volumes} != expected_backends:
            raise ServerClientError(
                "Volumes from different backends specified for different mount points"
            )
        if {v.configuration.region for v in mount_point_volumes} != expected_regions:
            raise ServerClientError(
                "Volumes from different regions specified for different mount points"
            )
        for volume in mount_point_volumes:
            if volume.status != VolumeStatus.ACTIVE:
                raise ServerClientError(f"Cannot mount volumes that are not active: {volume.name}")
    all_names = [volume.name for mount_point_volumes in volumes for volume in mount_point_volumes]
    if len(set(all_names)) != len(all_names):
        raise ServerClientError("Cannot attach the same volume at different mount points")
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
async def get_job_attached_volumes(
    session: AsyncSession,
    project: ProjectModel,
    run_spec: RunSpec,
    job_num: int,
    job_provisioning_data: JobProvisioningData,
) -> List[Volume]:
    """
    Returns volumes attached to the job.

    For each configured mount point, the volume is picked
    by `_get_job_mount_point_attached_volume`.
    """
    job_configured_volumes = await get_job_configured_volumes(
        session=session,
        project=project,
        run_spec=run_spec,
        job_num=job_num,
    )
    return [
        _get_job_mount_point_attached_volume(mount_point_volumes, job_provisioning_data)
        for mount_point_volumes in job_configured_volumes
    ]
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def remove_job_spec_sensitive_info(spec: JobSpec):
    """
    Clears `ssh_key` on the given job spec in place.
    """
    spec.ssh_key = None
|
|
783
|
+
|
|
784
|
+
|
|
785
|
+
def _get_job_mount_point_attached_volume(
    volumes: List[Volume],
    job_provisioning_data: JobProvisioningData,
) -> Volume:
    """
    Returns the volume attached to the job among the list of possible mount point volumes.

    The first volume matching the job's base backend and region (case-insensitive) is picked.
    If both the volume and the job have an availability zone, it must match too.

    Raises:
        ServerClientError: if no volume is eligible.
    """
    for candidate in volumes:
        configuration = candidate.configuration
        if not (
            configuration.backend == job_provisioning_data.get_base_backend()
            and configuration.region.lower() == job_provisioning_data.region.lower()
        ):
            continue
        volume_provisioning_data = candidate.provisioning_data
        volume_zone = (
            volume_provisioning_data.availability_zone
            if volume_provisioning_data is not None
            else None
        )
        job_zone = job_provisioning_data.availability_zone
        if (
            volume_zone is not None
            and job_zone is not None
            and volume_zone.lower() != job_zone.lower()
        ):
            continue
        return candidate
    raise ServerClientError("Failed to find an eligible volume for the mount point")
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
def _get_job_status_message(job_model: JobModel) -> str:
    """
    Returns a short status message derived from the job's status and termination reason.
    Falls back to the status value if there is no more specific message.
    """
    status = job_model.status
    reason = job_model.termination_reason
    if status == JobStatus.DONE:
        return "exited (0)"
    if status == JobStatus.FAILED:
        if reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
            return f"exited ({job_model.exit_status})"
        if reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
            reason_message = job_model.termination_reason_message
            if reason_message and "No matching fleet found" in reason_message:
                return "no fleets"
            return "no offers"
        if reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
            return "interrupted"
        return "error"
    if status == JobStatus.TERMINATED:
        if reason == JobTerminationReason.TERMINATED_BY_USER:
            return "stopped"
        if reason == JobTerminationReason.ABORTED_BY_USER:
            return "aborted"
    return status.value
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
def _get_job_error(job_model: JobModel) -> Optional[str]:
    """
    Returns the error for the job's termination reason, or `None` if no reason is set.
    """
    reason = job_model.termination_reason
    return None if reason is None else reason.to_error()
|