dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +80 -0
- dstack/_internal/cli/commands/apply.py +100 -0
- dstack/_internal/cli/commands/attach.py +161 -0
- dstack/_internal/cli/commands/completion.py +22 -0
- dstack/_internal/cli/commands/delete.py +44 -0
- dstack/_internal/cli/commands/event.py +168 -0
- dstack/_internal/cli/commands/fleet.py +161 -0
- dstack/_internal/cli/commands/gateway.py +159 -0
- dstack/_internal/cli/commands/init.py +64 -0
- dstack/_internal/cli/commands/login.py +352 -0
- dstack/_internal/cli/commands/logs.py +62 -0
- dstack/_internal/cli/commands/metrics.py +153 -0
- dstack/_internal/cli/commands/offer.py +146 -0
- dstack/_internal/cli/commands/project.py +259 -0
- dstack/_internal/cli/commands/ps.py +81 -0
- dstack/_internal/cli/commands/run.py +69 -0
- dstack/_internal/cli/commands/secrets.py +92 -0
- dstack/_internal/cli/commands/server.py +96 -0
- dstack/_internal/cli/commands/stop.py +26 -0
- dstack/_internal/cli/commands/volume.py +117 -0
- dstack/_internal/cli/main.py +101 -0
- dstack/_internal/cli/models/gateways.py +16 -0
- dstack/_internal/cli/models/offers.py +47 -0
- dstack/_internal/cli/models/runs.py +16 -0
- dstack/_internal/cli/services/args.py +31 -0
- dstack/_internal/cli/services/completion.py +91 -0
- dstack/_internal/cli/services/configurators/__init__.py +86 -0
- dstack/_internal/cli/services/configurators/base.py +103 -0
- dstack/_internal/cli/services/configurators/fleet.py +475 -0
- dstack/_internal/cli/services/configurators/gateway.py +231 -0
- dstack/_internal/cli/services/configurators/run.py +882 -0
- dstack/_internal/cli/services/configurators/volume.py +222 -0
- dstack/_internal/cli/services/events.py +68 -0
- dstack/_internal/cli/services/profile.py +182 -0
- dstack/_internal/cli/services/repos.py +71 -0
- dstack/_internal/cli/services/resources.py +54 -0
- dstack/_internal/cli/utils/common.py +159 -0
- dstack/_internal/cli/utils/fleet.py +106 -0
- dstack/_internal/cli/utils/gateway.py +56 -0
- dstack/_internal/cli/utils/gpu.py +178 -0
- dstack/_internal/cli/utils/rich.py +156 -0
- dstack/_internal/cli/utils/run.py +517 -0
- dstack/_internal/cli/utils/secrets.py +25 -0
- dstack/_internal/cli/utils/updates.py +98 -0
- dstack/_internal/cli/utils/volume.py +58 -0
- dstack/_internal/compat.py +3 -0
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/auth.py +30 -0
- dstack/_internal/core/backends/aws/backend.py +31 -0
- dstack/_internal/core/backends/aws/compute.py +1153 -0
- dstack/_internal/core/backends/aws/configurator.py +191 -0
- dstack/_internal/core/backends/aws/models.py +135 -0
- dstack/_internal/core/backends/aws/resources.py +700 -0
- dstack/_internal/core/backends/azure/auth.py +39 -0
- dstack/_internal/core/backends/azure/backend.py +21 -0
- dstack/_internal/core/backends/azure/compute.py +676 -0
- dstack/_internal/core/backends/azure/configurator.py +472 -0
- dstack/_internal/core/backends/azure/models.py +98 -0
- dstack/_internal/core/backends/azure/resources.py +116 -0
- dstack/_internal/core/backends/azure/utils.py +42 -0
- dstack/_internal/core/backends/base/backend.py +18 -0
- dstack/_internal/core/backends/base/compute.py +1101 -0
- dstack/_internal/core/backends/base/configurator.py +117 -0
- dstack/_internal/core/backends/base/models.py +24 -0
- dstack/_internal/core/backends/base/offers.py +232 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +181 -0
- dstack/_internal/core/backends/cudo/__init__.py +0 -0
- dstack/_internal/core/backends/cudo/api_client.py +111 -0
- dstack/_internal/core/backends/cudo/backend.py +16 -0
- dstack/_internal/core/backends/cudo/compute.py +174 -0
- dstack/_internal/core/backends/cudo/configurator.py +63 -0
- dstack/_internal/core/backends/cudo/models.py +37 -0
- dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
- dstack/_internal/core/backends/datacrunch/backend.py +18 -0
- dstack/_internal/core/backends/datacrunch/compute.py +8 -0
- dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/dstack/__init__.py +0 -0
- dstack/_internal/core/backends/dstack/models.py +26 -0
- dstack/_internal/core/backends/features.py +74 -0
- dstack/_internal/core/backends/gcp/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/auth.py +57 -0
- dstack/_internal/core/backends/gcp/backend.py +17 -0
- dstack/_internal/core/backends/gcp/compute.py +1257 -0
- dstack/_internal/core/backends/gcp/configurator.py +206 -0
- dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
- dstack/_internal/core/backends/gcp/models.py +160 -0
- dstack/_internal/core/backends/gcp/resources.py +585 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +188 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
- dstack/_internal/core/backends/kubernetes/backend.py +16 -0
- dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
- dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
- dstack/_internal/core/backends/kubernetes/models.py +71 -0
- dstack/_internal/core/backends/kubernetes/utils.py +81 -0
- dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
- dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
- dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
- dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
- dstack/_internal/core/backends/lambdalabs/models.py +37 -0
- dstack/_internal/core/backends/local/__init__.py +0 -0
- dstack/_internal/core/backends/local/backend.py +14 -0
- dstack/_internal/core/backends/local/compute.py +130 -0
- dstack/_internal/core/backends/models.py +158 -0
- dstack/_internal/core/backends/nebius/__init__.py +0 -0
- dstack/_internal/core/backends/nebius/backend.py +16 -0
- dstack/_internal/core/backends/nebius/compute.py +401 -0
- dstack/_internal/core/backends/nebius/configurator.py +98 -0
- dstack/_internal/core/backends/nebius/models.py +185 -0
- dstack/_internal/core/backends/nebius/resources.py +433 -0
- dstack/_internal/core/backends/oci/__init__.py +0 -0
- dstack/_internal/core/backends/oci/auth.py +21 -0
- dstack/_internal/core/backends/oci/backend.py +16 -0
- dstack/_internal/core/backends/oci/compute.py +209 -0
- dstack/_internal/core/backends/oci/configurator.py +156 -0
- dstack/_internal/core/backends/oci/exceptions.py +15 -0
- dstack/_internal/core/backends/oci/models.py +87 -0
- dstack/_internal/core/backends/oci/region.py +86 -0
- dstack/_internal/core/backends/oci/resources.py +836 -0
- dstack/_internal/core/backends/runpod/__init__.py +0 -0
- dstack/_internal/core/backends/runpod/api_client.py +627 -0
- dstack/_internal/core/backends/runpod/backend.py +16 -0
- dstack/_internal/core/backends/runpod/compute.py +444 -0
- dstack/_internal/core/backends/runpod/configurator.py +63 -0
- dstack/_internal/core/backends/runpod/models.py +54 -0
- dstack/_internal/core/backends/template/__init__.py +0 -0
- dstack/_internal/core/backends/template/backend.py.jinja +16 -0
- dstack/_internal/core/backends/template/compute.py.jinja +95 -0
- dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
- dstack/_internal/core/backends/template/models.py.jinja +62 -0
- dstack/_internal/core/backends/tensordock/models.py +40 -0
- dstack/_internal/core/backends/vastai/__init__.py +0 -0
- dstack/_internal/core/backends/vastai/api_client.py +143 -0
- dstack/_internal/core/backends/vastai/backend.py +16 -0
- dstack/_internal/core/backends/vastai/compute.py +141 -0
- dstack/_internal/core/backends/vastai/configurator.py +69 -0
- dstack/_internal/core/backends/vastai/models.py +37 -0
- dstack/_internal/core/backends/verda/__init__.py +0 -0
- dstack/_internal/core/backends/verda/backend.py +16 -0
- dstack/_internal/core/backends/verda/compute.py +266 -0
- dstack/_internal/core/backends/verda/configurator.py +73 -0
- dstack/_internal/core/backends/verda/models.py +38 -0
- dstack/_internal/core/backends/vultr/__init__.py +0 -0
- dstack/_internal/core/backends/vultr/api_client.py +116 -0
- dstack/_internal/core/backends/vultr/backend.py +16 -0
- dstack/_internal/core/backends/vultr/compute.py +167 -0
- dstack/_internal/core/backends/vultr/configurator.py +71 -0
- dstack/_internal/core/backends/vultr/models.py +34 -0
- dstack/_internal/core/compatibility/__init__.py +0 -0
- dstack/_internal/core/compatibility/events.py +13 -0
- dstack/_internal/core/compatibility/fleets.py +58 -0
- dstack/_internal/core/compatibility/gateways.py +39 -0
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/logs.py +14 -0
- dstack/_internal/core/compatibility/runs.py +86 -0
- dstack/_internal/core/compatibility/volumes.py +37 -0
- dstack/_internal/core/consts.py +8 -0
- dstack/_internal/core/errors.py +160 -0
- dstack/_internal/core/models/__init__.py +0 -0
- dstack/_internal/core/models/auth.py +28 -0
- dstack/_internal/core/models/backends/__init__.py +0 -0
- dstack/_internal/core/models/backends/base.py +48 -0
- dstack/_internal/core/models/common.py +143 -0
- dstack/_internal/core/models/compute_groups.py +39 -0
- dstack/_internal/core/models/config.py +28 -0
- dstack/_internal/core/models/configurations.py +1123 -0
- dstack/_internal/core/models/envs.py +149 -0
- dstack/_internal/core/models/events.py +98 -0
- dstack/_internal/core/models/files.py +67 -0
- dstack/_internal/core/models/fleets.py +437 -0
- dstack/_internal/core/models/gateways.py +146 -0
- dstack/_internal/core/models/gpus.py +45 -0
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +346 -0
- dstack/_internal/core/models/logs.py +27 -0
- dstack/_internal/core/models/metrics.py +14 -0
- dstack/_internal/core/models/placement.py +27 -0
- dstack/_internal/core/models/profiles.py +431 -0
- dstack/_internal/core/models/projects.py +46 -0
- dstack/_internal/core/models/repos/__init__.py +34 -0
- dstack/_internal/core/models/repos/base.py +36 -0
- dstack/_internal/core/models/repos/local.py +96 -0
- dstack/_internal/core/models/repos/remote.py +341 -0
- dstack/_internal/core/models/repos/virtual.py +85 -0
- dstack/_internal/core/models/resources.py +424 -0
- dstack/_internal/core/models/routers.py +24 -0
- dstack/_internal/core/models/runs.py +618 -0
- dstack/_internal/core/models/secrets.py +16 -0
- dstack/_internal/core/models/server.py +7 -0
- dstack/_internal/core/models/services.py +76 -0
- dstack/_internal/core/models/unix.py +53 -0
- dstack/_internal/core/models/users.py +60 -0
- dstack/_internal/core/models/volumes.py +221 -0
- dstack/_internal/core/services/__init__.py +16 -0
- dstack/_internal/core/services/api_client.py +15 -0
- dstack/_internal/core/services/configs/__init__.py +116 -0
- dstack/_internal/core/services/diff.py +71 -0
- dstack/_internal/core/services/logs.py +58 -0
- dstack/_internal/core/services/profiles.py +46 -0
- dstack/_internal/core/services/repos.py +236 -0
- dstack/_internal/core/services/ssh/__init__.py +27 -0
- dstack/_internal/core/services/ssh/attach.py +241 -0
- dstack/_internal/core/services/ssh/client.py +113 -0
- dstack/_internal/core/services/ssh/key_manager.py +53 -0
- dstack/_internal/core/services/ssh/ports.py +89 -0
- dstack/_internal/core/services/ssh/tunnel.py +337 -0
- dstack/_internal/proxy/__init__.py +8 -0
- dstack/_internal/proxy/gateway/__init__.py +0 -0
- dstack/_internal/proxy/gateway/app.py +89 -0
- dstack/_internal/proxy/gateway/auth.py +26 -0
- dstack/_internal/proxy/gateway/const.py +7 -0
- dstack/_internal/proxy/gateway/deps.py +73 -0
- dstack/_internal/proxy/gateway/main.py +17 -0
- dstack/_internal/proxy/gateway/models.py +23 -0
- dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
- dstack/_internal/proxy/gateway/repo/repo.py +121 -0
- dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
- dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
- dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
- dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
- dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
- dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
- dstack/_internal/proxy/gateway/routers/auth.py +10 -0
- dstack/_internal/proxy/gateway/routers/config.py +28 -0
- dstack/_internal/proxy/gateway/routers/registry.py +124 -0
- dstack/_internal/proxy/gateway/routers/stats.py +18 -0
- dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
- dstack/_internal/proxy/gateway/schemas/common.py +5 -0
- dstack/_internal/proxy/gateway/schemas/config.py +9 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
- dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
- dstack/_internal/proxy/gateway/services/__init__.py +0 -0
- dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
- dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
- dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
- dstack/_internal/proxy/gateway/services/nginx.py +455 -0
- dstack/_internal/proxy/gateway/services/registry.py +426 -0
- dstack/_internal/proxy/gateway/services/server_client.py +95 -0
- dstack/_internal/proxy/gateway/services/stats.py +170 -0
- dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
- dstack/_internal/proxy/gateway/testing/common.py +13 -0
- dstack/_internal/proxy/lib/__init__.py +0 -0
- dstack/_internal/proxy/lib/auth.py +7 -0
- dstack/_internal/proxy/lib/deps.py +106 -0
- dstack/_internal/proxy/lib/errors.py +14 -0
- dstack/_internal/proxy/lib/models.py +112 -0
- dstack/_internal/proxy/lib/repo.py +27 -0
- dstack/_internal/proxy/lib/routers/__init__.py +0 -0
- dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
- dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
- dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
- dstack/_internal/proxy/lib/services/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
- dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
- dstack/_internal/proxy/lib/services/service_connection.py +160 -0
- dstack/_internal/proxy/lib/testing/__init__.py +0 -0
- dstack/_internal/proxy/lib/testing/auth.py +11 -0
- dstack/_internal/proxy/lib/testing/common.py +51 -0
- dstack/_internal/server/__init__.py +0 -0
- dstack/_internal/server/alembic.ini +100 -0
- dstack/_internal/server/app.py +432 -0
- dstack/_internal/server/background/__init__.py +142 -0
- dstack/_internal/server/background/tasks/__init__.py +0 -0
- dstack/_internal/server/background/tasks/common.py +24 -0
- dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
- dstack/_internal/server/background/tasks/process_events.py +17 -0
- dstack/_internal/server/background/tasks/process_fleets.py +289 -0
- dstack/_internal/server/background/tasks/process_gateways.py +188 -0
- dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
- dstack/_internal/server/background/tasks/process_instances.py +1186 -0
- dstack/_internal/server/background/tasks/process_metrics.py +172 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
- dstack/_internal/server/background/tasks/process_runs.py +842 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
- dstack/_internal/server/background/tasks/process_volumes.py +129 -0
- dstack/_internal/server/compatibility/__init__.py +0 -0
- dstack/_internal/server/compatibility/common.py +20 -0
- dstack/_internal/server/compatibility/gpus.py +22 -0
- dstack/_internal/server/db.py +127 -0
- dstack/_internal/server/deps.py +19 -0
- dstack/_internal/server/main.py +4 -0
- dstack/_internal/server/migrations/__init__.py +0 -0
- dstack/_internal/server/migrations/env.py +112 -0
- dstack/_internal/server/migrations/script.py.mako +28 -0
- dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
- dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
- dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
- dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
- dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
- dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
- dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
- dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
- dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
- dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
- dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
- dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
- dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
- dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
- dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
- dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
- dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
- dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
- dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
- dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
- dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
- dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
- dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
- dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
- dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
- dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
- dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
- dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
- dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
- dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
- dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
- dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
- dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
- dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
- dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
- dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
- dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
- dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
- dstack/_internal/server/migrations/versions/__init__.py +0 -0
- dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
- dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
- dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
- dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
- dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
- dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
- dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
- dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
- dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
- dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
- dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
- dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
- dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
- dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
- dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
- dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
- dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
- dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
- dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
- dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
- dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
- dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
- dstack/_internal/server/models.py +930 -0
- dstack/_internal/server/routers/__init__.py +0 -0
- dstack/_internal/server/routers/auth.py +34 -0
- dstack/_internal/server/routers/backends.py +142 -0
- dstack/_internal/server/routers/events.py +60 -0
- dstack/_internal/server/routers/files.py +68 -0
- dstack/_internal/server/routers/fleets.py +202 -0
- dstack/_internal/server/routers/gateways.py +109 -0
- dstack/_internal/server/routers/gpus.py +32 -0
- dstack/_internal/server/routers/instances.py +77 -0
- dstack/_internal/server/routers/logs.py +34 -0
- dstack/_internal/server/routers/metrics.py +82 -0
- dstack/_internal/server/routers/projects.py +205 -0
- dstack/_internal/server/routers/prometheus.py +35 -0
- dstack/_internal/server/routers/repos.py +118 -0
- dstack/_internal/server/routers/runs.py +216 -0
- dstack/_internal/server/routers/secrets.py +86 -0
- dstack/_internal/server/routers/server.py +19 -0
- dstack/_internal/server/routers/users.py +158 -0
- dstack/_internal/server/routers/volumes.py +122 -0
- dstack/_internal/server/schemas/__init__.py +0 -0
- dstack/_internal/server/schemas/auth.py +83 -0
- dstack/_internal/server/schemas/backends.py +16 -0
- dstack/_internal/server/schemas/common.py +9 -0
- dstack/_internal/server/schemas/events.py +211 -0
- dstack/_internal/server/schemas/files.py +5 -0
- dstack/_internal/server/schemas/fleets.py +49 -0
- dstack/_internal/server/schemas/gateways.py +31 -0
- dstack/_internal/server/schemas/gpus.py +26 -0
- dstack/_internal/server/schemas/health/__init__.py +0 -0
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +47 -0
- dstack/_internal/server/schemas/logs.py +17 -0
- dstack/_internal/server/schemas/projects.py +81 -0
- dstack/_internal/server/schemas/repos.py +24 -0
- dstack/_internal/server/schemas/runner.py +269 -0
- dstack/_internal/server/schemas/runs.py +66 -0
- dstack/_internal/server/schemas/secrets.py +16 -0
- dstack/_internal/server/schemas/users.py +72 -0
- dstack/_internal/server/schemas/volumes.py +29 -0
- dstack/_internal/server/security/__init__.py +0 -0
- dstack/_internal/server/security/permissions.py +251 -0
- dstack/_internal/server/services/__init__.py +0 -0
- dstack/_internal/server/services/auth.py +77 -0
- dstack/_internal/server/services/backends/__init__.py +404 -0
- dstack/_internal/server/services/backends/handlers.py +105 -0
- dstack/_internal/server/services/compute_groups.py +22 -0
- dstack/_internal/server/services/config.py +279 -0
- dstack/_internal/server/services/docker.py +162 -0
- dstack/_internal/server/services/encryption/__init__.py +102 -0
- dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
- dstack/_internal/server/services/encryption/keys/aes.py +68 -0
- dstack/_internal/server/services/encryption/keys/base.py +19 -0
- dstack/_internal/server/services/encryption/keys/identity.py +28 -0
- dstack/_internal/server/services/events.py +477 -0
- dstack/_internal/server/services/files.py +91 -0
- dstack/_internal/server/services/fleets.py +1224 -0
- dstack/_internal/server/services/gateways/__init__.py +686 -0
- dstack/_internal/server/services/gateways/client.py +209 -0
- dstack/_internal/server/services/gateways/connection.py +139 -0
- dstack/_internal/server/services/gateways/pool.py +58 -0
- dstack/_internal/server/services/gpus.py +387 -0
- dstack/_internal/server/services/instances.py +731 -0
- dstack/_internal/server/services/jobs/__init__.py +840 -0
- dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
- dstack/_internal/server/services/jobs/configurators/base.py +469 -0
- dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
- dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
- dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
- dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
- dstack/_internal/server/services/jobs/configurators/service.py +28 -0
- dstack/_internal/server/services/jobs/configurators/task.py +39 -0
- dstack/_internal/server/services/locking.py +187 -0
- dstack/_internal/server/services/logging.py +29 -0
- dstack/_internal/server/services/logs/__init__.py +122 -0
- dstack/_internal/server/services/logs/aws.py +373 -0
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +261 -0
- dstack/_internal/server/services/logs/fluentbit.py +329 -0
- dstack/_internal/server/services/logs/gcp.py +181 -0
- dstack/_internal/server/services/metrics.py +172 -0
- dstack/_internal/server/services/offers.py +249 -0
- dstack/_internal/server/services/permissions.py +37 -0
- dstack/_internal/server/services/placement.py +234 -0
- dstack/_internal/server/services/plugins.py +109 -0
- dstack/_internal/server/services/probes.py +10 -0
- dstack/_internal/server/services/projects.py +835 -0
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
- dstack/_internal/server/services/proxy/__init__.py +3 -0
- dstack/_internal/server/services/proxy/auth.py +12 -0
- dstack/_internal/server/services/proxy/deps.py +18 -0
- dstack/_internal/server/services/proxy/repo.py +189 -0
- dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
- dstack/_internal/server/services/proxy/services/__init__.py +0 -0
- dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
- dstack/_internal/server/services/repos.py +362 -0
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +260 -0
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/__init__.py +0 -0
- dstack/_internal/server/services/runner/client.py +646 -0
- dstack/_internal/server/services/runner/ssh.py +128 -0
- dstack/_internal/server/services/runs/__init__.py +1026 -0
- dstack/_internal/server/services/runs/plan.py +703 -0
- dstack/_internal/server/services/runs/replicas.py +317 -0
- dstack/_internal/server/services/runs/spec.py +191 -0
- dstack/_internal/server/services/secrets.py +245 -0
- dstack/_internal/server/services/services/__init__.py +345 -0
- dstack/_internal/server/services/services/autoscalers.py +140 -0
- dstack/_internal/server/services/services/options.py +53 -0
- dstack/_internal/server/services/ssh.py +67 -0
- dstack/_internal/server/services/storage/__init__.py +37 -0
- dstack/_internal/server/services/storage/base.py +48 -0
- dstack/_internal/server/services/storage/gcs.py +66 -0
- dstack/_internal/server/services/storage/s3.py +69 -0
- dstack/_internal/server/services/users.py +461 -0
- dstack/_internal/server/services/volumes.py +496 -0
- dstack/_internal/server/settings.py +161 -0
- dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
- dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
- dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
- dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
- dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
- dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
- dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
- dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
- dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
- dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
- dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
- dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
- dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
- dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
- dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
- dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
- dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
- dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
- dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
- dstack/_internal/server/statics/assets/favicon.ico +0 -0
- dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
- dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
- dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
- dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
- dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
- dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
- dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
- dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
- dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
- dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
- dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
- dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
- dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
- dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
- dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
- dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
- dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
- dstack/_internal/server/statics/index.html +3 -0
- dstack/_internal/server/statics/logo-notext.svg +116 -0
- dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
- dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
- dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
- dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
- dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
- dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
- dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
- dstack/_internal/server/testing/__init__.py +0 -0
- dstack/_internal/server/testing/common.py +1220 -0
- dstack/_internal/server/testing/conf.py +53 -0
- dstack/_internal/server/testing/matchers.py +31 -0
- dstack/_internal/server/utils/__init__.py +0 -0
- dstack/_internal/server/utils/common.py +55 -0
- dstack/_internal/server/utils/logging.py +51 -0
- dstack/_internal/server/utils/provisioning.py +368 -0
- dstack/_internal/server/utils/routers.py +166 -0
- dstack/_internal/server/utils/sentry_utils.py +24 -0
- dstack/_internal/settings.py +49 -0
- dstack/_internal/utils/__init__.py +0 -0
- dstack/_internal/utils/common.py +318 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/_internal/utils/crypto.py +40 -0
- dstack/_internal/utils/env.py +88 -0
- dstack/_internal/utils/event_loop.py +30 -0
- dstack/_internal/utils/files.py +69 -0
- dstack/_internal/utils/gpu.py +59 -0
- dstack/_internal/utils/hash.py +31 -0
- dstack/_internal/utils/interpolator.py +91 -0
- dstack/_internal/utils/json_schema.py +11 -0
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/_internal/utils/logging.py +5 -0
- dstack/_internal/utils/nested_list.py +47 -0
- dstack/_internal/utils/network.py +50 -0
- dstack/_internal/utils/path.py +57 -0
- dstack/_internal/utils/random_names.py +258 -0
- dstack/_internal/utils/ssh.py +346 -0
- dstack/_internal/utils/tags.py +42 -0
- dstack/_internal/utils/typing.py +14 -0
- dstack/_internal/utils/version.py +22 -0
- dstack/api/__init__.py +46 -0
- dstack/api/_public/__init__.py +96 -0
- dstack/api/_public/backends.py +42 -0
- dstack/api/_public/common.py +5 -0
- dstack/api/_public/repos.py +202 -0
- dstack/api/_public/runs.py +714 -0
- dstack/api/server/__init__.py +206 -0
- dstack/api/server/_auth.py +30 -0
- dstack/api/server/_backends.py +38 -0
- dstack/api/server/_events.py +64 -0
- dstack/api/server/_files.py +18 -0
- dstack/api/server/_fleets.py +82 -0
- dstack/api/server/_gateways.py +54 -0
- dstack/api/server/_gpus.py +27 -0
- dstack/api/server/_group.py +22 -0
- dstack/api/server/_logs.py +15 -0
- dstack/api/server/_metrics.py +23 -0
- dstack/api/server/_projects.py +124 -0
- dstack/api/server/_repos.py +64 -0
- dstack/api/server/_runs.py +102 -0
- dstack/api/server/_secrets.py +36 -0
- dstack/api/server/_users.py +82 -0
- dstack/api/server/_volumes.py +39 -0
- dstack/api/server/utils.py +34 -0
- dstack/api/utils.py +105 -0
- dstack/core/__init__.py +0 -0
- dstack/plugins/__init__.py +8 -0
- dstack/plugins/_base.py +72 -0
- dstack/plugins/_models.py +8 -0
- dstack/plugins/_utils.py +19 -0
- dstack/plugins/builtin/__init__.py +0 -0
- dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
- dstack/plugins/builtin/rest_plugin/_models.py +48 -0
- dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
- dstack/version.py +3 -1
- dstack-0.20.7.dist-info/METADATA +519 -0
- dstack-0.20.7.dist-info/RECORD +720 -0
- {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
- dstack-0.20.7.dist-info/entry_points.txt +2 -0
- dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
- dstack/aws/__init__.py +0 -180
- dstack/aws/artifacts.py +0 -111
- dstack/aws/config.py +0 -40
- dstack/aws/jobs.py +0 -245
- dstack/aws/logs.py +0 -186
- dstack/aws/repos.py +0 -137
- dstack/aws/run_names.py +0 -17
- dstack/aws/runners.py +0 -693
- dstack/aws/runs.py +0 -79
- dstack/aws/secrets.py +0 -99
- dstack/aws/tags.py +0 -138
- dstack/backend.py +0 -299
- dstack/cli/app.py +0 -41
- dstack/cli/artifacts.py +0 -87
- dstack/cli/common.py +0 -57
- dstack/cli/config.py +0 -194
- dstack/cli/dashboard.py +0 -26
- dstack/cli/delete.py +0 -49
- dstack/cli/init.py +0 -33
- dstack/cli/logs.py +0 -87
- dstack/cli/main.py +0 -81
- dstack/cli/restart.py +0 -43
- dstack/cli/run.py +0 -223
- dstack/cli/schema.py +0 -46
- dstack/cli/secrets.py +0 -97
- dstack/cli/status.py +0 -140
- dstack/cli/stop.py +0 -53
- dstack/cli/tags.py +0 -100
- dstack/config.py +0 -80
- dstack/dashboard/artifacts.py +0 -26
- dstack/dashboard/logs.py +0 -73
- dstack/dashboard/main.py +0 -45
- dstack/dashboard/repos.py +0 -41
- dstack/dashboard/runs.py +0 -140
- dstack/dashboard/secrets.py +0 -53
- dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
- dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
- dstack/dashboard/statics/assets/browserconfig.xml +0 -15
- dstack/dashboard/statics/assets/coast-228x228.png +0 -0
- dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
- dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
- dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
- dstack/dashboard/statics/assets/favicon.ico +0 -0
- dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
- dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
- dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
- dstack/dashboard/statics/assets/manifest.webapp +0 -14
- dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
- dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
- dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
- dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
- dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
- dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
- dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
- dstack/dashboard/statics/index.html +0 -7
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
- dstack/dashboard/statics/main.css +0 -5058
- dstack/dashboard/statics/splash_thumbnail.png +0 -0
- dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
- dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
- dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
- dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
- dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
- dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
- dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
- dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
- dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
- dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
- dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
- dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
- dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
- dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
- dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
- dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
- dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
- dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
- dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
- dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
- dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
- dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
- dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
- dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
- dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
- dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
- dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
- dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
- dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
- dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
- dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
- dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
- dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
- dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
- dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
- dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
- dstack/dashboard/tags.py +0 -119
- dstack/jobs.py +0 -255
- dstack/providers/__init__.py +0 -316
- dstack/providers/_python/main.py +0 -88
- dstack/providers/_tensorboard/main.py +0 -93
- dstack/providers/_torchrun/main.py +0 -121
- dstack/providers/bash/main.py +0 -90
- dstack/providers/code/main.py +0 -95
- dstack/providers/docker/main.py +0 -79
- dstack/providers/lab/main.py +0 -95
- dstack/providers/notebook/main.py +0 -90
- dstack/random_name.py +0 -29
- dstack/repo.py +0 -135
- dstack/runners.py +0 -35
- dstack/util.py +0 -15
- dstack-0.0.9.dist-info/METADATA +0 -176
- dstack-0.0.9.dist-info/RECORD +0 -179
- dstack-0.0.9.dist-info/entry_points.txt +0 -3
- dstack-0.0.9.dist-info/top_level.txt +0 -2
- tests/test_config.py +0 -70
- /dstack/{cli → _internal}/__init__.py +0 -0
- /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
- /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
- /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
- /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
- /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
- /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
- /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
- /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
- /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
- {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
- /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
- /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
|
@@ -0,0 +1,842 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import datetime
|
|
3
|
+
import json
|
|
4
|
+
from typing import List, Optional, Set, Tuple
|
|
5
|
+
|
|
6
|
+
from sqlalchemy import and_, func, or_, select
|
|
7
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
|
+
from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only, with_loader_criteria
|
|
9
|
+
|
|
10
|
+
import dstack._internal.server.services.services.autoscalers as autoscalers
|
|
11
|
+
from dstack._internal.core.errors import ServerError
|
|
12
|
+
from dstack._internal.core.models.configurations import ReplicaGroup
|
|
13
|
+
from dstack._internal.core.models.profiles import RetryEvent, StopCriteria
|
|
14
|
+
from dstack._internal.core.models.runs import (
|
|
15
|
+
Job,
|
|
16
|
+
JobSpec,
|
|
17
|
+
JobStatus,
|
|
18
|
+
JobTerminationReason,
|
|
19
|
+
Run,
|
|
20
|
+
RunSpec,
|
|
21
|
+
RunStatus,
|
|
22
|
+
RunTerminationReason,
|
|
23
|
+
)
|
|
24
|
+
from dstack._internal.server.db import get_db, get_session_ctx
|
|
25
|
+
from dstack._internal.server.models import (
|
|
26
|
+
FleetModel,
|
|
27
|
+
InstanceModel,
|
|
28
|
+
JobModel,
|
|
29
|
+
ProjectModel,
|
|
30
|
+
RunModel,
|
|
31
|
+
UserModel,
|
|
32
|
+
)
|
|
33
|
+
from dstack._internal.server.services.jobs import (
|
|
34
|
+
find_job,
|
|
35
|
+
get_job_specs_from_run_spec,
|
|
36
|
+
group_jobs_by_replica_latest,
|
|
37
|
+
is_master_job,
|
|
38
|
+
job_model_to_job_submission,
|
|
39
|
+
switch_job_status,
|
|
40
|
+
)
|
|
41
|
+
from dstack._internal.server.services.locking import get_locker
|
|
42
|
+
from dstack._internal.server.services.prometheus.client_metrics import run_metrics
|
|
43
|
+
from dstack._internal.server.services.runs import (
|
|
44
|
+
fmt,
|
|
45
|
+
process_terminating_run,
|
|
46
|
+
run_model_to_run,
|
|
47
|
+
switch_run_status,
|
|
48
|
+
)
|
|
49
|
+
from dstack._internal.server.services.runs.replicas import (
|
|
50
|
+
build_replica_lists,
|
|
51
|
+
has_out_of_date_replicas,
|
|
52
|
+
is_replica_registered,
|
|
53
|
+
job_belongs_to_group,
|
|
54
|
+
retry_run_replica_jobs,
|
|
55
|
+
scale_down_replicas,
|
|
56
|
+
scale_run_replicas,
|
|
57
|
+
scale_run_replicas_per_group,
|
|
58
|
+
)
|
|
59
|
+
from dstack._internal.server.services.secrets import get_project_secrets_mapping
|
|
60
|
+
from dstack._internal.server.services.services import update_service_desired_replica_count
|
|
61
|
+
from dstack._internal.server.utils import sentry_utils
|
|
62
|
+
from dstack._internal.utils import common
|
|
63
|
+
from dstack._internal.utils.logging import get_logger
|
|
64
|
+
|
|
65
|
+
logger = get_logger(__name__)
|
|
66
|
+
|
|
67
|
+
MIN_PROCESSING_INTERVAL = datetime.timedelta(seconds=5)
|
|
68
|
+
ROLLING_DEPLOYMENT_MAX_SURGE = 1 # at most one extra replica during rolling deployment
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
async def process_runs(batch_size: int = 1):
|
|
72
|
+
tasks = []
|
|
73
|
+
for _ in range(batch_size):
|
|
74
|
+
tasks.append(_process_next_run())
|
|
75
|
+
await asyncio.gather(*tasks)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@sentry_utils.instrument_background_task
|
|
79
|
+
async def _process_next_run():
|
|
80
|
+
run_lock, run_lockset = get_locker(get_db().dialect_name).get_lockset(RunModel.__tablename__)
|
|
81
|
+
job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
|
|
82
|
+
now = common.get_current_datetime()
|
|
83
|
+
async with get_session_ctx() as session:
|
|
84
|
+
async with run_lock, job_lock:
|
|
85
|
+
res = await session.execute(
|
|
86
|
+
select(RunModel)
|
|
87
|
+
.where(
|
|
88
|
+
RunModel.id.not_in(run_lockset),
|
|
89
|
+
RunModel.last_processed_at < now - MIN_PROCESSING_INTERVAL,
|
|
90
|
+
# Filter out runs that don't need to be processed.
|
|
91
|
+
# This is only to reduce unnecessary commits.
|
|
92
|
+
# Otherwise, we could fetch all active runs and filter them when processing.
|
|
93
|
+
or_(
|
|
94
|
+
# Active non-pending runs:
|
|
95
|
+
RunModel.status.not_in(
|
|
96
|
+
RunStatus.finished_statuses() + [RunStatus.PENDING]
|
|
97
|
+
),
|
|
98
|
+
# Retrying runs:
|
|
99
|
+
and_(
|
|
100
|
+
RunModel.status == RunStatus.PENDING,
|
|
101
|
+
RunModel.resubmission_attempt > 0,
|
|
102
|
+
),
|
|
103
|
+
# Scheduled ready runs:
|
|
104
|
+
and_(
|
|
105
|
+
RunModel.status == RunStatus.PENDING,
|
|
106
|
+
RunModel.resubmission_attempt == 0,
|
|
107
|
+
RunModel.next_triggered_at.is_not(None),
|
|
108
|
+
RunModel.next_triggered_at < now,
|
|
109
|
+
),
|
|
110
|
+
# Scaled-to-zero runs:
|
|
111
|
+
# Such runs cannot be scheduled, thus we check next_triggered_at.
|
|
112
|
+
# If we allow scheduled services with downscaling to zero
|
|
113
|
+
# This check won't pass.
|
|
114
|
+
and_(
|
|
115
|
+
RunModel.status == RunStatus.PENDING,
|
|
116
|
+
RunModel.resubmission_attempt == 0,
|
|
117
|
+
RunModel.next_triggered_at.is_(None),
|
|
118
|
+
),
|
|
119
|
+
),
|
|
120
|
+
)
|
|
121
|
+
.options(
|
|
122
|
+
joinedload(RunModel.jobs).load_only(JobModel.id),
|
|
123
|
+
# No need to lock finished jobs
|
|
124
|
+
with_loader_criteria(
|
|
125
|
+
JobModel,
|
|
126
|
+
JobModel.status.not_in(JobStatus.finished_statuses()),
|
|
127
|
+
include_aliases=True,
|
|
128
|
+
),
|
|
129
|
+
)
|
|
130
|
+
.options(load_only(RunModel.id))
|
|
131
|
+
.order_by(RunModel.last_processed_at.asc())
|
|
132
|
+
.limit(1)
|
|
133
|
+
.with_for_update(skip_locked=True, key_share=True, of=RunModel)
|
|
134
|
+
)
|
|
135
|
+
run_model = res.scalar()
|
|
136
|
+
if run_model is None:
|
|
137
|
+
return
|
|
138
|
+
res = await session.execute(
|
|
139
|
+
select(JobModel)
|
|
140
|
+
.where(
|
|
141
|
+
JobModel.run_id == run_model.id,
|
|
142
|
+
JobModel.id.not_in(job_lockset),
|
|
143
|
+
)
|
|
144
|
+
.options(
|
|
145
|
+
load_only(JobModel.id),
|
|
146
|
+
with_loader_criteria(
|
|
147
|
+
JobModel,
|
|
148
|
+
JobModel.status.not_in(JobStatus.finished_statuses()),
|
|
149
|
+
include_aliases=True,
|
|
150
|
+
),
|
|
151
|
+
)
|
|
152
|
+
.order_by(JobModel.id) # take locks in order
|
|
153
|
+
.with_for_update(skip_locked=True, key_share=True)
|
|
154
|
+
)
|
|
155
|
+
job_models = res.scalars().all()
|
|
156
|
+
if len(run_model.jobs) != len(job_models):
|
|
157
|
+
# Some jobs are locked or there was a non-repeatable read
|
|
158
|
+
return
|
|
159
|
+
job_ids = [j.id for j in run_model.jobs]
|
|
160
|
+
run_lockset.add(run_model.id)
|
|
161
|
+
job_lockset.update(job_ids)
|
|
162
|
+
run_model_id = run_model.id
|
|
163
|
+
try:
|
|
164
|
+
await _process_run(session=session, run_model=run_model)
|
|
165
|
+
finally:
|
|
166
|
+
run_lockset.difference_update([run_model_id])
|
|
167
|
+
job_lockset.difference_update(job_ids)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
async def _process_run(session: AsyncSession, run_model: RunModel):
|
|
171
|
+
run_model = await _refetch_run_model(session, run_model)
|
|
172
|
+
logger.debug("%s: processing run", fmt(run_model))
|
|
173
|
+
try:
|
|
174
|
+
if run_model.status == RunStatus.PENDING:
|
|
175
|
+
await _process_pending_run(session, run_model)
|
|
176
|
+
elif run_model.status in {RunStatus.SUBMITTED, RunStatus.PROVISIONING, RunStatus.RUNNING}:
|
|
177
|
+
await _process_active_run(session, run_model)
|
|
178
|
+
elif run_model.status == RunStatus.TERMINATING:
|
|
179
|
+
await process_terminating_run(session, run_model)
|
|
180
|
+
else:
|
|
181
|
+
logger.error("%s: unexpected status %s", fmt(run_model), run_model.status.name)
|
|
182
|
+
run_model.termination_reason = RunTerminationReason.SERVER_ERROR
|
|
183
|
+
switch_run_status(session, run_model, RunStatus.TERMINATING)
|
|
184
|
+
except ServerError as e:
|
|
185
|
+
logger.error("%s: run processing error: %s", fmt(run_model), e)
|
|
186
|
+
run_model.termination_reason = RunTerminationReason.SERVER_ERROR
|
|
187
|
+
switch_run_status(session, run_model, RunStatus.TERMINATING)
|
|
188
|
+
|
|
189
|
+
run_model.last_processed_at = common.get_current_datetime()
|
|
190
|
+
await session.commit()
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
async def _refetch_run_model(session: AsyncSession, run_model: RunModel) -> RunModel:
    """
    Reload the run from the database, overwriting already-loaded state.

    The returned run has its project, user and fleet joined-loaded with a
    reduced column set, and `jobs` populated only with the latest submission
    of every (replica_num, job_num) pair, each with its instance's `fleet_id`.
    """
    run_id = run_model.id

    # Highest submission_num for every (run, replica, job) triple of this run.
    latest_sq = (
        select(
            JobModel.run_id.label("run_id"),
            JobModel.replica_num.label("replica_num"),
            JobModel.job_num.label("job_num"),
            func.max(JobModel.submission_num).label("max_submission_num"),
        )
        .where(JobModel.run_id == run_id)
        .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num)
        .subquery()
    )

    latest_job = aliased(JobModel)
    # Matches a job row only if it is the latest submission of its triple.
    is_latest_submission = and_(
        latest_job.run_id == latest_sq.c.run_id,
        latest_job.replica_num == latest_sq.c.replica_num,
        latest_job.job_num == latest_sq.c.job_num,
        latest_job.submission_num == latest_sq.c.max_submission_num,
    )

    stmt = select(RunModel).where(RunModel.id == run_id)
    stmt = stmt.outerjoin(latest_sq, latest_sq.c.run_id == RunModel.id)
    stmt = stmt.outerjoin(latest_job, onclause=is_latest_submission)
    stmt = stmt.options(
        joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name)
    )
    stmt = stmt.options(joinedload(RunModel.user).load_only(UserModel.name))
    stmt = stmt.options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
    stmt = stmt.options(
        contains_eager(RunModel.jobs, alias=latest_job)
        .joinedload(JobModel.instance)
        .load_only(InstanceModel.fleet_id)
    )
    stmt = stmt.execution_options(populate_existing=True)

    result = await session.execute(stmt)
    return result.unique().scalar_one()
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
async def _process_pending_run(session: AsyncSession, run_model: RunModel):
    """
    Submit jobs for a pending run (jobs are not created yet).

    A run that is being retried is left untouched until its retry delay has
    elapsed. A service whose desired replica count is zero also stays
    pending. Otherwise replicas are scaled up and the run becomes SUBMITTED.
    """
    run = run_model_to_run(run_model)

    # TODO: Do not select such runs in the first place to avoid redundant processing
    is_retrying = run_model.resubmission_attempt > 0
    if is_retrying and not _retrying_run_ready_for_resubmission(run_model, run):
        logger.debug("%s: retrying run is not yet ready for resubmission", fmt(run_model))
        return

    configuration = run.run_spec.configuration
    if configuration.type != "service":
        run_model.desired_replica_count = 1
        await scale_run_replicas(
            session, run_model, replicas_diff=run_model.desired_replica_count
        )
    else:
        min_replica_counts = [group.count.min or 0 for group in configuration.replica_groups]
        run_model.desired_replica_count = sum(min_replica_counts)
        await update_service_desired_replica_count(
            session,
            run_model,
            configuration,
            # does not matter for pending services, since 0->n scaling should happen without delay
            last_scaled_at=None,
        )

        if run_model.desired_replica_count == 0:
            # stay zero scaled
            return

        replica_groups: List[ReplicaGroup] = configuration.replica_groups
        await scale_run_replicas_per_group(session, run_model, replica_groups)

    switch_run_status(session=session, run_model=run_model, new_status=RunStatus.SUBMITTED)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _retrying_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool:
    """
    Tell whether enough time has passed to resubmit a retrying run.

    The time since the latest job submission was last processed is compared
    with the retry delay for the run's current `resubmission_attempt`.
    Returns True when the run has no job submission at all.
    """
    latest_submission = run.latest_job_submission
    if latest_submission is None:
        # Should not be possible
        return True
    elapsed = common.get_current_datetime() - latest_submission.last_processed_at
    required_delay = _get_retry_delay(run_model.resubmission_attempt)
    return not elapsed < required_delay
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
# We use exponentially increasing retry delays for pending runs.
# This prevents creation of too many job submissions for runs stuck in pending,
# e.g. when users set retry for a long period without capacity.
_PENDING_RETRY_DELAYS = [
    datetime.timedelta(seconds=15),
    datetime.timedelta(seconds=30),
    datetime.timedelta(minutes=1),
    datetime.timedelta(minutes=2),
    datetime.timedelta(minutes=5),
    datetime.timedelta(minutes=10),
]


def _get_retry_delay(resubmission_attempt: int) -> datetime.timedelta:
    """
    Return the delay to wait before resubmitting a pending run.

    Args:
        resubmission_attempt: 1-based number of the resubmission attempt.

    Returns:
        The delay configured for that attempt. Attempts beyond the end of
        `_PENDING_RETRY_DELAYS` get the last (longest) delay, and attempts
        below 1 get the first (shortest) delay.
    """
    # Clamp the index into range: a raw `attempt - 1` would make attempt 0 wrap
    # around to the longest delay via negative indexing, and would raise
    # IndexError for sufficiently negative attempts.
    last_index = len(_PENDING_RETRY_DELAYS) - 1
    index = min(max(resubmission_attempt - 1, 0), last_index)
    return _PENDING_RETRY_DELAYS[index]
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
async def _process_active_run(session: AsyncSession, run_model: RunModel):
    """
    Run is submitted, provisioning, or running.
    We handle fails, scaling, and status changes.

    The statuses of the latest job submissions are first aggregated per replica
    and then per run to derive the new run status. Depending on that status the
    function terminates jobs of replicas that are to be retried, delegates
    retry/scaling/deployment to `_handle_run_replicas`, and finally switches the
    run status and updates `termination_reason` and `resubmission_attempt`.

    Raises:
        ValueError: if a job is in a status/termination-reason combination not
            covered below, or if the run is considered failed but no known
            termination reason was collected.
    """
    run = run_model_to_run(run_model)
    run_spec = run.run_spec
    retry_single_job = _can_retry_single_job(run_spec)

    # Statuses and failure reasons accumulated across all replicas.
    run_statuses: Set[RunStatus] = set()
    run_termination_reasons: Set[RunTerminationReason] = set()
    # (replica_num, jobs) of replicas that have a retryable failed job.
    replicas_to_retry: List[Tuple[int, List[JobModel]]] = []

    replicas_info: List[autoscalers.ReplicaInfo] = []
    for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
        replica_statuses: Set[RunStatus] = set()
        replica_needs_retry = False
        replica_active = True
        jobs_done_num = 0
        for job_model in job_models:
            job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
            # Assign the run to the fleet of the first job instance that has one.
            if (
                run_model.fleet_id is None
                and job_model.instance is not None
                and job_model.instance.fleet_id is not None
            ):
                run_model.fleet_id = job_model.instance.fleet_id
            if job_model.status == JobStatus.DONE or (
                job_model.status == JobStatus.TERMINATING
                and job_model.termination_reason == JobTerminationReason.DONE_BY_RUNNER
            ):
                # the job is done or going to be done
                replica_statuses.add(RunStatus.DONE)
                jobs_done_num += 1
            elif job_model.termination_reason == JobTerminationReason.SCALED_DOWN:
                # the job was scaled down
                replica_active = False
            elif job_model.status == JobStatus.RUNNING:
                # the job is running
                replica_statuses.add(RunStatus.RUNNING)
            elif job_model.status in {JobStatus.PROVISIONING, JobStatus.PULLING}:
                # the job is provisioning
                replica_statuses.add(RunStatus.PROVISIONING)
            elif job_model.status == JobStatus.SUBMITTED:
                # the job is submitted
                replica_statuses.add(RunStatus.SUBMITTED)
            elif job_model.status == JobStatus.FAILED or (
                job_model.status
                in [JobStatus.TERMINATING, JobStatus.TERMINATED, JobStatus.ABORTED]
                and job_model.termination_reason
                not in {JobTerminationReason.DONE_BY_RUNNER, JobTerminationReason.SCALED_DOWN}
            ):
                # The job failed. `None` means retry does not apply to this failure;
                # otherwise the value is how long the job has been retrying.
                current_duration = await _should_retry_job(session, run, job, job_model)
                if current_duration is None:
                    replica_statuses.add(RunStatus.FAILED)
                    run_termination_reasons.add(RunTerminationReason.JOB_FAILED)
                else:
                    if _is_retry_duration_exceeded(job, current_duration):
                        replica_statuses.add(RunStatus.FAILED)
                        run_termination_reasons.add(RunTerminationReason.RETRY_LIMIT_EXCEEDED)
                    else:
                        replica_needs_retry = True
            else:
                raise ValueError(f"Unexpected job status {job_model.status}")

        if RunStatus.FAILED in replica_statuses:
            # One non-retryable failure fails the whole run.
            run_statuses.add(RunStatus.FAILED)
        else:
            if replica_needs_retry:
                replicas_to_retry.append((replica_num, job_models))
            # Statuses of a replica that is to be retried as a whole do not
            # contribute to the run status.
            if not replica_needs_retry or retry_single_job:
                run_statuses.update(replica_statuses)

        if jobs_done_num == len(job_models):
            # Consider replica inactive if all its jobs are done for some reason.
            # If only some jobs are done, replica is considered active to avoid
            # provisioning new replicas for partially done multi-node tasks.
            replica_active = False

        replica_info = _get_replica_info(job_models, replica_active)
        replicas_info.append(replica_info)

    # Derive the new run status; earlier branches take precedence over later ones.
    termination_reason: Optional[RunTerminationReason] = None
    if RunStatus.FAILED in run_statuses:
        new_status = RunStatus.TERMINATING
        if RunTerminationReason.JOB_FAILED in run_termination_reasons:
            termination_reason = RunTerminationReason.JOB_FAILED
        elif RunTerminationReason.RETRY_LIMIT_EXCEEDED in run_termination_reasons:
            termination_reason = RunTerminationReason.RETRY_LIMIT_EXCEEDED
        else:
            raise ValueError(f"Unexpected termination reason {run_termination_reasons}")
    elif _should_stop_on_master_done(run):
        new_status = RunStatus.TERMINATING
        # ALL_JOBS_DONE is used for all DONE reasons including master-done
        termination_reason = RunTerminationReason.ALL_JOBS_DONE
    elif RunStatus.RUNNING in run_statuses:
        new_status = RunStatus.RUNNING
    elif RunStatus.PROVISIONING in run_statuses:
        new_status = RunStatus.PROVISIONING
    elif RunStatus.SUBMITTED in run_statuses:
        new_status = RunStatus.SUBMITTED
    elif RunStatus.DONE in run_statuses and not replicas_to_retry:
        new_status = RunStatus.TERMINATING
        termination_reason = RunTerminationReason.ALL_JOBS_DONE
    else:
        # Nothing is running, provisioning, or submitted: resubmit the run.
        new_status = RunStatus.PENDING

    # Terminate active jobs if the run is to be resubmitted
    if new_status == RunStatus.PENDING and not retry_single_job:
        for _, replica_jobs in replicas_to_retry:
            for job_model in replica_jobs:
                if not (
                    job_model.status.is_finished() or job_model.status == JobStatus.TERMINATING
                ):
                    job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
                    job_model.termination_reason_message = "Run is to be resubmitted"
                    switch_job_status(session, job_model, JobStatus.TERMINATING)

    if new_status not in {RunStatus.TERMINATING, RunStatus.PENDING}:
        # No need to retry, scale, or redeploy replicas if the run is terminating,
        # pending run will retry replicas in `process_pending_run`
        await _handle_run_replicas(
            session, run_model, run_spec, replicas_to_retry, retry_single_job, replicas_info
        )

    if run_model.status != new_status:
        if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
            # Log and record how long the run waited before provisioning started.
            current_time = common.get_current_datetime()
            submit_to_provision_duration = (current_time - run_model.submitted_at).total_seconds()
            logger.info(
                "%s: run took %.2f seconds from submission to provisioning.",
                fmt(run_model),
                submit_to_provision_duration,
            )
            project_name = run_model.project.name
            run_metrics.log_submit_to_provision_duration(
                submit_to_provision_duration, project_name, run_spec.configuration.type
            )

        if new_status == RunStatus.PENDING:
            run_metrics.increment_pending_runs(run_model.project.name, run_spec.configuration.type)
            # Unassign run from fleet so that the new fleet can be chosen when retrying
            run_model.fleet = None

        run_model.termination_reason = termination_reason
        switch_run_status(session, run_model, new_status)
        # While a run goes to pending without provisioning, resubmission_attempt increases.
        if new_status == RunStatus.PROVISIONING:
            run_model.resubmission_attempt = 0
        elif new_status == RunStatus.PENDING:
            run_model.resubmission_attempt += 1
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _get_replica_info(
    replica_job_models: list[JobModel],
    replica_active: bool,
) -> autoscalers.ReplicaInfo:
    """
    Build the autoscaler's view of one replica from its job models.

    For an active replica the timestamp is the earliest `submitted_at` of its
    jobs; for an inactive one it is the latest `last_processed_at`.
    """
    if not replica_active:
        # last_processed_at = replica scaled down
        scaled_down_at = max(job.last_processed_at for job in replica_job_models)
        return autoscalers.ReplicaInfo(active=False, timestamp=scaled_down_at)
    # submitted_at = replica created
    created_at = min(job.submitted_at for job in replica_job_models)
    return autoscalers.ReplicaInfo(active=True, timestamp=created_at)
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
async def _handle_run_replicas(
    session: AsyncSession,
    run_model: RunModel,
    run_spec: RunSpec,
    replicas_to_retry: list[tuple[int, list[JobModel]]],
    retry_single_job: bool,
    replicas_info: list[autoscalers.ReplicaInfo],
) -> None:
    """
    Does ONE of:
    - replica retry
    - replica scaling
    - replica rolling deployment

    Does not do everything at once to avoid conflicts between the stages and long DB transactions.

    Args:
        session: DB session used for all updates.
        run_model: the run being processed.
        run_spec: the run's spec; its configuration type selects the service path.
        replicas_to_retry: (replica_num, jobs) pairs of replicas to resubmit.
        retry_single_job: passed as `only_failed` when retrying replica jobs.
        replicas_info: per-replica active flag and timestamp, used for scaling decisions.
    """

    # Stage 1: retries take precedence over scaling and deployment.
    if replicas_to_retry:
        for _, replica_jobs in replicas_to_retry:
            await retry_run_replica_jobs(
                session, run_model, replica_jobs, only_failed=retry_single_job
            )
        return

    # Services are handled per replica group and return from this branch.
    if run_spec.configuration.type == "service":
        await update_service_desired_replica_count(
            session,
            run_model,
            run_spec.configuration,
            # FIXME: should only include scaling events, not retries and deployments
            last_scaled_at=max((r.timestamp for r in replicas_info), default=None),
        )
        replicas: List[ReplicaGroup] = run_spec.configuration.replica_groups
        assert replicas, "replica groups should always return at least one group"

        await scale_run_replicas_per_group(session, run_model, replicas)

        # Handle per-group rolling deployment
        await _update_jobs_to_new_deployment_in_place(
            session=session,
            run_model=run_model,
            run_spec=run_spec,
            replicas=replicas,
        )
        # Process per-group rolling deployment
        for group in replicas:
            await _handle_rolling_deployment_for_group(
                session=session, run_model=run_model, group=group, run_spec=run_spec
            )
        # Terminate replicas from groups that were removed from the configuration
        existing_group_names = set()
        for job in run_model.jobs:
            if job.status.is_finished():
                continue
            try:
                job_spec = JobSpec.__response__.parse_raw(job.job_spec_data)
                existing_group_names.add(job_spec.replica_group)
            except Exception:
                # Jobs whose spec cannot be parsed are ignored here.
                continue
        new_group_names = {group.name for group in replicas}
        removed_group_names = existing_group_names - new_group_names
        for removed_group_name in removed_group_names:
            # Build replica lists for this removed group
            active_replicas, inactive_replicas = build_replica_lists(
                run_model=run_model,
                group_filter=removed_group_name,
            )

            total_replicas = len(active_replicas) + len(inactive_replicas)
            if total_replicas > 0:
                logger.info(
                    "%s: terminating %d replica(s) from removed group '%s'",
                    fmt(run_model),
                    total_replicas,
                    removed_group_name,
                )
                # Terminate all active replicas in the removed group
                if active_replicas:
                    scale_down_replicas(session, active_replicas, len(active_replicas))
                # Terminate all inactive replicas in the removed group
                if inactive_replicas:
                    scale_down_replicas(session, inactive_replicas, len(inactive_replicas))
        return

    # NOTE(review): the service branch above appears to always return, so the code
    # below seems to be reached only for non-service runs - confirm.
    max_replica_count = run_model.desired_replica_count
    if has_out_of_date_replicas(run_model):
        # allow extra replicas when deployment is in progress
        max_replica_count += ROLLING_DEPLOYMENT_MAX_SURGE

    # Stage 2: scale when the active replica count is outside
    # [desired_replica_count, max_replica_count].
    active_replica_count = sum(1 for r in replicas_info if r.active)
    if active_replica_count not in range(run_model.desired_replica_count, max_replica_count + 1):
        await scale_run_replicas(
            session,
            run_model,
            replicas_diff=run_model.desired_replica_count - active_replica_count,
        )
        return

    # Stage 3: deployment. Jobs whose spec is unchanged are bumped in place first.
    await _update_jobs_to_new_deployment_in_place(
        session=session,
        run_model=run_model,
        run_spec=run_spec,
    )
    if has_out_of_date_replicas(run_model):
        assert run_spec.configuration.type == "service", (
            "Rolling deployment is only supported for services"
        )
        non_terminated_replica_count = len(
            {j.replica_num for j in run_model.jobs if not j.status.is_finished()}
        )
        # Avoid using too much hardware during a deployment - never have
        # more than max_replica_count non-terminated replicas.
        if non_terminated_replica_count < max_replica_count:
            # Start more up-to-date replicas that will eventually replace out-of-date replicas.
            await scale_run_replicas(
                session,
                run_model,
                replicas_diff=max_replica_count - non_terminated_replica_count,
            )

        replicas_to_stop_count = 0
        # stop any out-of-date replicas that are not registered
        replicas_to_stop_count += sum(
            any(j.deployment_num < run_model.deployment_num for j in jobs)
            and any(
                j.status not in [JobStatus.TERMINATING] + JobStatus.finished_statuses()
                for j in jobs
            )
            and not is_replica_registered(jobs)
            for _, jobs in group_jobs_by_replica_latest(run_model.jobs)
        )
        # stop excessive registered out-of-date replicas, except those that are already `terminating`
        non_terminating_registered_replicas_count = sum(
            is_replica_registered(jobs) and all(j.status != JobStatus.TERMINATING for j in jobs)
            for _, jobs in group_jobs_by_replica_latest(run_model.jobs)
        )
        replicas_to_stop_count += max(
            0, non_terminating_registered_replicas_count - run_model.desired_replica_count
        )
        if replicas_to_stop_count:
            await scale_run_replicas(
                session,
                run_model,
                replicas_diff=-replicas_to_stop_count,
            )
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
async def _update_jobs_to_new_deployment_in_place(
    session: AsyncSession,
    run_model: RunModel,
    run_spec: RunSpec,
    replicas: Optional[List] = None,
) -> None:
    """
    Bump deployment_num for jobs that do not require redeployment.

    For every replica that is neither fully finished nor already on the run's
    current deployment, job specs are regenerated from `run_spec`. If they
    all equal the stored specs, the replica's jobs are marked as belonging to
    the current deployment. When `replicas` is given, the replica group name
    is taken from the first job's stored spec.
    """
    secrets = await get_project_secrets_mapping(
        session=session,
        project=run_model.project,
    )

    for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
        # Skip replicas that are finished or already up to date.
        if all(j.status.is_finished() for j in job_models) or all(
            j.deployment_num == run_model.deployment_num for j in job_models
        ):
            continue

        # Determine which group this replica belongs to
        replica_group_name = None
        if replicas:
            first_job_spec = JobSpec.__response__.parse_raw(job_models[0].job_spec_data)
            replica_group_name = first_job_spec.replica_group

        # FIXME: Handle getting image configuration errors or skip it.
        new_job_specs = await get_job_specs_from_run_spec(
            run_spec=run_spec,
            secrets=secrets,
            replica_num=replica_num,
            replica_group_name=replica_group_name,
        )
        assert len(new_job_specs) == len(job_models), (
            "Changing the number of jobs within a replica is not yet supported"
        )

        # Stops at the first job whose regenerated spec differs from the stored one.
        spec_changed = any(
            new_spec != JobSpec.__response__.parse_raw(old_model.job_spec_data)
            for old_model, new_spec in zip(job_models, new_job_specs)
        )
        if spec_changed:
            continue

        for job_model in job_models:
            job_model.deployment_num = run_model.deployment_num
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
async def _should_retry_job(
    session: AsyncSession,
    run: Run,
    job: Job,
    job_model: JobModel,
) -> Optional[datetime.timedelta]:
    """
    Checks if the job should be retried.
    Returns the current duration of retrying if retry is enabled.
    Retrying duration is calculated as the time since `last_processed_at`
    of the latest provisioned submission.

    If no submission was ever provisioned and the job terminated for a reason
    mapped to `RetryEvent.NO_CAPACITY` (with that event enabled in the retry
    policy), the duration is counted from `run.submitted_at` instead.

    Returns:
        The retrying duration, or None if the job has no retry policy or its
        termination reason is not covered by the policy's `on_events`.
    """
    if job.job_spec.retry is None:
        return None

    # Find the most recent submission that got provisioning data, if any.
    last_provisioned_submission = None
    if len(job.job_submissions) > 0:
        last_submission = job.job_submissions[-1]
        if last_submission.job_provisioning_data is not None:
            last_provisioned_submission = last_submission
        else:
            # The caller passes at most one latest submission in job.job_submissions, so check the db.
            res = await session.execute(
                select(JobModel)
                .where(
                    JobModel.run_id == job_model.run_id,
                    JobModel.replica_num == job_model.replica_num,
                    JobModel.job_num == job_model.job_num,
                    JobModel.job_provisioning_data.is_not(None),
                )
                .order_by(JobModel.last_processed_at.desc())
                .limit(1)
            )
            last_provisioned_submission_model = res.scalar()
            if last_provisioned_submission_model is not None:
                last_provisioned_submission = job_model_to_job_submission(
                    last_provisioned_submission_model
                )

    # Never provisioned and failed with no capacity: count from run submission.
    if (
        job_model.termination_reason is not None
        and job_model.termination_reason.to_retry_event() == RetryEvent.NO_CAPACITY
        and last_provisioned_submission is None
        and RetryEvent.NO_CAPACITY in job.job_spec.retry.on_events
    ):
        return common.get_current_datetime() - run.submitted_at

    # Provisioned at least once: count from the last provisioned submission.
    if (
        job_model.termination_reason is not None
        and job_model.termination_reason.to_retry_event() in job.job_spec.retry.on_events
        and last_provisioned_submission is not None
    ):
        return common.get_current_datetime() - last_provisioned_submission.last_processed_at

    return None
|
|
721
|
+
|
|
722
|
+
|
|
723
|
+
def _is_retry_duration_exceeded(job: Job, current_duration: datetime.timedelta) -> bool:
    """
    Tell whether the job has been retrying longer than its retry policy allows.

    A job without a retry policy is always reported as exceeded. Otherwise
    `current_duration` is compared with the policy's `duration` in seconds.
    """
    retry_policy = job.job_spec.retry
    if retry_policy is None:
        return True
    max_duration = datetime.timedelta(seconds=retry_policy.duration)
    return current_duration > max_duration
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def _can_retry_single_job(run_spec: RunSpec) -> bool:
    """
    Tell whether a single failed job may be retried without its whole replica.

    Always returns False for now; `run_spec` is not inspected.
    """
    # TODO: Currently, we terminate and retry the entire replica if one of the job fails.
    # We could make partial retry in some multi-node cases.
    # E.g. restarting a worker node, independent jobs.
    partial_retry_supported = False
    return partial_retry_supported
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
def _should_stop_on_master_done(run: Run) -> bool:
    """
    Tell whether the run must stop because its master job is done.

    True only when the merged profile's stop criteria is MASTER_DONE and a
    master job's last submission has the DONE status.
    """
    if run.run_spec.merged_profile.stop_criteria == StopCriteria.MASTER_DONE:
        return any(
            is_master_job(candidate) and candidate.job_submissions[-1].status == JobStatus.DONE
            for candidate in run.jobs
        )
    return False
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
async def _handle_rolling_deployment_for_group(
    session: AsyncSession, run_model: RunModel, group: ReplicaGroup, run_spec: RunSpec
) -> None:
    """
    Handle rolling deployment for a single replica group.

    Does nothing if the group has no out-of-date replicas. Otherwise starts
    new replicas up to the group's desired count plus
    `ROLLING_DEPLOYMENT_MAX_SURGE`, then stops out-of-date replicas that are
    not registered as well as registered replicas exceeding the desired count.
    """
    from dstack._internal.server.services.runs.replicas import (
        build_replica_lists,
        scale_run_replicas_for_group,
    )

    # Per-group desired counts are stored on the run as a JSON string; an
    # empty/missing value is treated as "no per-group counts".
    desired_replica_counts = (
        json.loads(run_model.desired_replica_counts) if run_model.desired_replica_counts else {}
    )

    # Fall back to the group's configured minimum (or 0) when no count is stored.
    group_desired = desired_replica_counts.get(group.name, group.count.min or 0)

    # Check if group has out-of-date replicas
    if not has_out_of_date_replicas(run_model, group_filter=group.name):
        return  # Group is up-to-date

    # Calculate max replicas (allow surge during deployment)
    group_max_replica_count = group_desired + ROLLING_DEPLOYMENT_MAX_SURGE

    # Count non-terminated replicas for this group only

    non_terminated_replica_count = len(
        {
            j.replica_num
            for j in run_model.jobs
            if not j.status.is_finished()
            and group.name is not None
            and job_belongs_to_group(job=j, group_name=group.name)
        }
    )

    # Start new up-to-date replicas if needed
    if non_terminated_replica_count < group_max_replica_count:
        active_replicas, inactive_replicas = build_replica_lists(
            run_model=run_model,
            group_filter=group.name,
        )

        await scale_run_replicas_for_group(
            session=session,
            run_model=run_model,
            group=group,
            replicas_diff=group_max_replica_count - non_terminated_replica_count,
            run_spec=run_spec,
            active_replicas=active_replicas,
            inactive_replicas=inactive_replicas,
        )

    # Stop out-of-date replicas that are not registered
    replicas_to_stop_count = 0
    for _, jobs in group_jobs_by_replica_latest(run_model.jobs):
        assert group.name is not None, "Group name is always set"
        # The first job of a replica decides which group the replica belongs to.
        if not job_belongs_to_group(jobs[0], group.name):
            continue
        # Check if replica is out-of-date and not registered
        if (
            any(j.deployment_num < run_model.deployment_num for j in jobs)
            and any(
                j.status not in [JobStatus.TERMINATING] + JobStatus.finished_statuses()
                for j in jobs
            )
            and not is_replica_registered(jobs)
        ):
            replicas_to_stop_count += 1

    # Stop excessive registered out-of-date replicas
    non_terminating_registered_replicas_count = 0
    for _, jobs in group_jobs_by_replica_latest(run_model.jobs):
        assert group.name is not None, "Group name is always set"
        if not job_belongs_to_group(jobs[0], group.name):
            continue

        if is_replica_registered(jobs) and all(j.status != JobStatus.TERMINATING for j in jobs):
            non_terminating_registered_replicas_count += 1

    replicas_to_stop_count += max(0, non_terminating_registered_replicas_count - group_desired)

    if replicas_to_stop_count > 0:
        # Build lists again to get current state
        active_replicas, inactive_replicas = build_replica_lists(
            run_model=run_model,
            group_filter=group.name,
        )

        # A negative diff asks for replicas to be scaled down.
        await scale_run_replicas_for_group(
            session=session,
            run_model=run_model,
            group=group,
            replicas_diff=-replicas_to_stop_count,
            run_spec=run_spec,
            active_replicas=active_replicas,
            inactive_replicas=inactive_replicas,
        )
|