skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
ADDED
@@ -0,0 +1,307 @@
+"""Scheduler for managed jobs.
+
+Once managed jobs are submitted via submit_job, the scheduler is responsible for
+the business logic of deciding when they are allowed to start, and choosing the
+right one to start. The scheduler will also schedule jobs that are already live
+but waiting to launch a new task or recover.
+
+The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
+be called from any code running on the managed jobs controller instance to
+trigger scheduling of new jobs if possible. This function should be called
+immediately after any state change that could result in jobs newly being able to
+be scheduled.
+
+The scheduling logic limits the number of running jobs according to two limits:
+1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
+   once, based on the number of CPUs. (See _get_launch_parallelism.) This is the
+   most compute-intensive part of the job lifecycle, which is why we have an
+   additional limit.
+2. The number of jobs that can be running at any given time, based on the amount
+   of memory. (See _get_job_parallelism.) Since the job controller is doing very
+   little once a job starts (just checking its status periodically), the most
+   significant resource it consumes is memory.
+
+The state of the scheduler is entirely determined by the schedule_state column
+of all the jobs in the job_info table. This column should only be modified via
+the functions defined in this file. We will always hold the lock while modifying
+this state. See state.ManagedJobScheduleState.
+
+Nomenclature:
+- job: same as managed job (may include multiple tasks)
+- launch/launching: launching a cluster (sky.launch) as part of a job
+- start/run: create the job controller process for a job
+- schedule: transition a job to the LAUNCHING state, whether a new job or a job
+  that is already alive
+- alive: a job controller exists (includes multiple schedule_states: ALIVE,
+  ALIVE_WAITING, LAUNCHING)
+"""
+
+from argparse import ArgumentParser
+import contextlib
+from functools import lru_cache
+import os
+import time
+
+import filelock
+import psutil
+
+from sky import sky_logging
+from sky.jobs import constants as managed_job_constants
+from sky.jobs import state
+from sky.skylet import constants
+from sky.utils import common_utils
+from sky.utils import subprocess_utils
+
+logger = sky_logging.init_logger('sky.jobs.controller')
+
+# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
+# parallelism control or updating the schedule_state of any job.
+# Any code that takes this lock must conclude by calling
+# maybe_schedule_next_jobs.
+_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
+_ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
+
+# Based on testing, assume a running job uses 350MB memory.
+JOB_MEMORY_MB = 350
+# Past 2000 simultaneous jobs, we become unstable.
+# See https://github.com/skypilot-org/skypilot/issues/4649.
+MAX_JOB_LIMIT = 2000
+# Number of ongoing launches allowed per CPU.
+LAUNCHES_PER_CPU = 4
+
+
+@lru_cache(maxsize=1)
+def _get_lock_path() -> str:
+    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    return path
+
+
+def maybe_schedule_next_jobs() -> None:
+    """Determine if any managed jobs can be scheduled, and if so, schedule them.
+
+    Here, "schedule" means to select a job that is waiting, and allow it to
+    proceed. It does NOT mean to submit a job to the scheduler.
+
+    For newly submitted jobs, scheduling means updating the state of the jobs,
+    and starting the job controller process. For jobs that are already alive but
+    are waiting to launch a new task or recover, just update the state of the
+    job to indicate that the launch can proceed.
+
+    This function transitions jobs into LAUNCHING on a best-effort basis. That
+    is, if we can start any jobs, we will, but if not, we will exit (almost)
+    immediately. It's expected that if some WAITING or ALIVE_WAITING jobs cannot
+    be started now (either because the lock is held, or because there are not
+    enough resources), another call to this function will be made whenever that
+    situation is resolved. (If the lock is held, the lock holder should start
+    the jobs. If there aren't enough resources, the next controller to exit and
+    free up resources should start the jobs.)
+
+    If this function obtains the lock, it will launch as many jobs as possible
+    before releasing the lock. This is what allows other calls to exit
+    immediately if the lock is held, while ensuring that all jobs are started as
+    soon as possible.
+
+    This uses subprocess_utils.launch_new_process_tree() to start the controller
+    processes, which should be safe to call from pretty much any code running on
+    the jobs controller instance. New job controller processes will be detached
+    from the current process and there will not be a parent/child relationship.
+    See launch_new_process_tree for more.
+    """
+    try:
+        # We must use a global lock rather than a per-job lock to ensure correct
+        # parallelism control. If we cannot obtain the lock, exit immediately.
+        # The current lock holder is expected to launch any jobs it can before
+        # releasing the lock.
+        with filelock.FileLock(_get_lock_path(), blocking=False):
+            while True:
+                maybe_next_job = state.get_waiting_job()
+                if maybe_next_job is None:
+                    # Nothing left to start, break from scheduling loop
+                    break
+
+                current_state = maybe_next_job['schedule_state']
+
+                assert current_state in (
+                    state.ManagedJobScheduleState.ALIVE_WAITING,
+                    state.ManagedJobScheduleState.WAITING), maybe_next_job
+
+                # Note: we expect to get ALIVE_WAITING jobs before WAITING jobs,
+                # since they will have been submitted and therefore started
+                # first. The requirements to launch in an alive job are more
+                # lenient, so there is no way that we wouldn't be able to launch
+                # an ALIVE_WAITING job, but we would be able to launch a WAITING
+                # job.
+                if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
+                    if not _can_lauch_in_alive_job():
+                        # Can't schedule anything, break from scheduling loop.
+                        break
+                elif current_state == state.ManagedJobScheduleState.WAITING:
+                    if not _can_start_new_job():
+                        # Can't schedule anything, break from scheduling loop.
+                        break
+
+                logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
+                state.scheduler_set_launching(maybe_next_job['job_id'],
+                                              current_state)
+
+                if current_state == state.ManagedJobScheduleState.WAITING:
+                    # The job controller has not been started yet. We must start
+                    # it.
+
+                    job_id = maybe_next_job['job_id']
+                    dag_yaml_path = maybe_next_job['dag_yaml_path']
+
+                    activate_python_env_cmd = (
+                        f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+                    env_file = maybe_next_job['env_file_path']
+                    source_environment_cmd = (f'source {env_file};'
+                                              if env_file else '')
+                    run_controller_cmd = ('python -u -m sky.jobs.controller '
+                                          f'{dag_yaml_path} --job-id {job_id};')
+
+                    # If the command line here is changed, please also update
+                    # utils._controller_process_alive. `--job-id X` should be at
+                    # the end.
+                    run_cmd = (f'{activate_python_env_cmd}'
+                               f'{source_environment_cmd}'
+                               f'{run_controller_cmd}')
+
+                    logs_dir = os.path.expanduser(
+                        managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+                    os.makedirs(logs_dir, exist_ok=True)
+                    log_path = os.path.join(logs_dir, f'{job_id}.log')
+
+                    pid = subprocess_utils.launch_new_process_tree(
+                        run_cmd, log_output=log_path)
+                    state.set_job_controller_pid(job_id, pid)
+
+                    logger.debug(f'Job {job_id} started with pid {pid}')
+
+    except filelock.Timeout:
+        # If we can't get the lock, just exit. The process holding the lock
+        # should launch any pending jobs.
+        pass
+
+
+def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
+    """Submit an existing job to the scheduler.
+
+    This should be called after a job is created in the `spot` table as
+    PENDING. It will tell the scheduler to try and start the job controller, if
+    there are resources available. It may block to acquire the lock, so it
+    should not be on the critical path for `sky jobs launch -d`.
+
+    The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
+    """
+    with filelock.FileLock(_get_lock_path()):
+        state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
+                                    common_utils.get_user_hash())
+    maybe_schedule_next_jobs()
+
+
+@contextlib.contextmanager
+def scheduled_launch(job_id: int):
+    """Launch as part of an ongoing job.
+
+    A newly started job will already be LAUNCHING, and this will immediately
+    enter the context.
+
+    If a job is ongoing (ALIVE schedule_state), there are two scenarios where we
+    may need to call sky.launch again during the course of a job controller:
+    - for tasks after the first task
+    - for recovery
+
+    This function will mark the job as ALIVE_WAITING, which indicates to the
+    scheduler that it wants to transition back to LAUNCHING. Then, it will wait
+    until the scheduler transitions the job state, before entering the context.
+
+    On exiting the context, the job will transition to ALIVE.
+
+    This should only be used within the job controller for the given job_id. If
+    multiple uses of this context are nested, behavior is undefined. Don't do
+    that.
+    """
+
+    # If we're already in LAUNCHING schedule_state, we don't need to wait.
+    # This may be the case for the first launch of a job.
+    if (state.get_job_schedule_state(job_id) !=
+            state.ManagedJobScheduleState.LAUNCHING):
+        # Since we aren't LAUNCHING, we need to wait to be scheduled.
+        _set_alive_waiting(job_id)
+
+        while (state.get_job_schedule_state(job_id) !=
+               state.ManagedJobScheduleState.LAUNCHING):
+            time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+
+    yield
+
+    with filelock.FileLock(_get_lock_path()):
+        state.scheduler_set_alive(job_id)
+    maybe_schedule_next_jobs()
+
+
+def job_done(job_id: int, idempotent: bool = False) -> None:
+    """Transition a job to DONE.
+
+    If idempotent is True, this will not raise an error if the job is already
+    DONE.
+
+    The job could be in any terminal ManagedJobStatus. However, once DONE, it
+    should never transition back to another state.
+    """
+    if idempotent and (state.get_job_schedule_state(job_id)
+                       == state.ManagedJobScheduleState.DONE):
+        return
+
+    with filelock.FileLock(_get_lock_path()):
+        state.scheduler_set_done(job_id, idempotent)
+    maybe_schedule_next_jobs()
+
+
+def _set_alive_waiting(job_id: int) -> None:
+    """Should use wait_until_launch_okay() to transition to this state."""
+    with filelock.FileLock(_get_lock_path()):
+        state.scheduler_set_alive_waiting(job_id)
+    maybe_schedule_next_jobs()
+
+
+def _get_job_parallelism() -> int:
+    job_memory = JOB_MEMORY_MB * 1024 * 1024
+
+    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
+
+    return max(job_limit, 1)
+
+
+def _get_launch_parallelism() -> int:
+    cpus = os.cpu_count()
+    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
+
+
+def _can_start_new_job() -> bool:
+    launching_jobs = state.get_num_launching_jobs()
+    alive_jobs = state.get_num_alive_jobs()
+    return launching_jobs < _get_launch_parallelism(
+    ) and alive_jobs < _get_job_parallelism()
+
+
+def _can_lauch_in_alive_job() -> bool:
+    launching_jobs = state.get_num_launching_jobs()
+    return launching_jobs < _get_launch_parallelism()
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('dag_yaml',
+                        type=str,
+                        help='The path to the user job yaml file.')
+    parser.add_argument('--job-id',
+                        required=True,
+                        type=int,
+                        help='Job id for the controller job.')
+    parser.add_argument('--env-file',
+                        type=str,
+                        help='The path to the controller env file.')
+    args = parser.parse_args()
+    submit_job(args.job_id, args.dag_yaml, args.env_file)
@@ -0,0 +1 @@
+
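
The new scheduler module above is driven entirely through its public functions: submit_job() hands a PENDING job to the scheduler, scheduled_launch() gates each sky.launch made by an already-alive job controller, and job_done() releases the job's slot. The sketch below illustrates, based only on the docstrings in the diff, how controller-side code might call this API; it is not SkyPilot source, and _launch_task and _wait_for_task are hypothetical placeholders for the real job controller logic.

    # Illustrative sketch only; assumes skypilot-nightly >= 1.0.0.dev2025022801
    # and that this runs on the managed jobs controller instance.
    from typing import List

    from sky.jobs import scheduler


    def _launch_task(task: str) -> None:
        # Hypothetical placeholder: the real controller calls sky.launch here.
        print(f'launching cluster for task {task}')


    def _wait_for_task(task: str) -> None:
        # Hypothetical placeholder: the real controller polls task status.
        print(f'waiting for task {task}')


    def run_job(job_id: int, tasks: List[str]) -> None:
        """Drive one managed job through the scheduler's state machine."""
        try:
            for task in tasks:
                # Every cluster launch (first task, later tasks, or recovery)
                # is wrapped in scheduled_launch(): it waits until the
                # scheduler transitions the job to LAUNCHING, then marks the
                # job ALIVE when the block exits.
                with scheduler.scheduled_launch(job_id):
                    _launch_task(task)
                _wait_for_task(task)
        finally:
            # Free the job's memory/CPU slot so other WAITING jobs can start.
            scheduler.job_done(job_id, idempotent=True)

On the submission side, once a job row has been created as PENDING, it would be handed off with scheduler.submit_job(job_id, dag_yaml_path, env_file_path); per the docstring, that call may block briefly on the scheduler lock before returning.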