skypilot-nightly 1.0.0.dev20251011__py3-none-any.whl → 1.0.0.dev20251013__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (52)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/shadeform.py +89 -0
  3. sky/authentication.py +43 -0
  4. sky/backends/backend_utils.py +2 -0
  5. sky/backends/cloud_vm_ray_backend.py +4 -2
  6. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  7. sky/catalog/shadeform_catalog.py +165 -0
  8. sky/client/cli/command.py +44 -3
  9. sky/client/sdk.py +11 -3
  10. sky/clouds/__init__.py +2 -0
  11. sky/clouds/shadeform.py +393 -0
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/{webpack-66f23594d38c7f16.js → webpack-ac3a34c8f9fef041.js} +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/provision/__init__.py +1 -0
  30. sky/provision/shadeform/__init__.py +11 -0
  31. sky/provision/shadeform/config.py +12 -0
  32. sky/provision/shadeform/instance.py +351 -0
  33. sky/provision/shadeform/shadeform_utils.py +83 -0
  34. sky/serve/constants.py +0 -3
  35. sky/serve/service_spec.py +1 -8
  36. sky/server/constants.py +4 -0
  37. sky/server/requests/executor.py +22 -2
  38. sky/server/requests/payloads.py +2 -0
  39. sky/server/requests/requests.py +119 -2
  40. sky/server/server.py +17 -6
  41. sky/setup_files/dependencies.py +1 -0
  42. sky/skylet/constants.py +1 -1
  43. sky/templates/shadeform-ray.yml.j2 +72 -0
  44. sky/utils/context_utils.py +13 -9
  45. {skypilot_nightly-1.0.0.dev20251011.dist-info → skypilot_nightly-1.0.0.dev20251013.dist-info}/METADATA +43 -41
  46. {skypilot_nightly-1.0.0.dev20251011.dist-info → skypilot_nightly-1.0.0.dev20251013.dist-info}/RECORD +52 -43
  47. /sky/dashboard/out/_next/static/{Xs6jdcfyNaUuBO8jmzU9_ → MtlDUf-nH1hhcy7xwbCj3}/_buildManifest.js +0 -0
  48. /sky/dashboard/out/_next/static/{Xs6jdcfyNaUuBO8jmzU9_ → MtlDUf-nH1hhcy7xwbCj3}/_ssgManifest.js +0 -0
  49. {skypilot_nightly-1.0.0.dev20251011.dist-info → skypilot_nightly-1.0.0.dev20251013.dist-info}/WHEEL +0 -0
  50. {skypilot_nightly-1.0.0.dev20251011.dist-info → skypilot_nightly-1.0.0.dev20251013.dist-info}/entry_points.txt +0 -0
  51. {skypilot_nightly-1.0.0.dev20251011.dist-info → skypilot_nightly-1.0.0.dev20251013.dist-info}/licenses/LICENSE +0 -0
  52. {skypilot_nightly-1.0.0.dev20251011.dist-info → skypilot_nightly-1.0.0.dev20251013.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-835d14ba94808f79.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ac3a34c8f9fef041.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/volumes-835d14ba94808f79.js" defer=""></script><script src="/dashboard/_next/static/MtlDUf-nH1hhcy7xwbCj3/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/MtlDUf-nH1hhcy7xwbCj3/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/volumes","query":{},"buildId":"MtlDUf-nH1hhcy7xwbCj3","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ac3a34c8f9fef041.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/MtlDUf-nH1hhcy7xwbCj3/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/MtlDUf-nH1hhcy7xwbCj3/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"MtlDUf-nH1hhcy7xwbCj3","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7359-c8d04e06886000b3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-66237729cdf9749e.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-d0782b9251f0fcd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-7e0e8f06bb2f881c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-3b40c39626f99c89.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-e8688c35c06f0ac5.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ac3a34c8f9fef041.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7359-c8d04e06886000b3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-66237729cdf9749e.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-f6818c84ed8f1c86.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-d0782b9251f0fcd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-7e0e8f06bb2f881c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-3b40c39626f99c89.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-e8688c35c06f0ac5.js" defer=""></script><script src="/dashboard/_next/static/MtlDUf-nH1hhcy7xwbCj3/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/MtlDUf-nH1hhcy7xwbCj3/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"MtlDUf-nH1hhcy7xwbCj3","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-66f23594d38c7f16.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-69c80d677d3c2949.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Xs6jdcfyNaUuBO8jmzU9_/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Xs6jdcfyNaUuBO8jmzU9_","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ac3a34c8f9fef041.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-69c80d677d3c2949.js" defer=""></script><script src="/dashboard/_next/static/MtlDUf-nH1hhcy7xwbCj3/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/MtlDUf-nH1hhcy7xwbCj3/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"MtlDUf-nH1hhcy7xwbCj3","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/provision/__init__.py CHANGED
@@ -28,6 +28,7 @@ from sky.provision import primeintellect
28
28
  from sky.provision import runpod
29
29
  from sky.provision import scp
30
30
  from sky.provision import seeweb
31
+ from sky.provision import shadeform
31
32
  from sky.provision import ssh
32
33
  from sky.provision import vast
33
34
  from sky.provision import vsphere
@@ -0,0 +1,11 @@
1
+ """Shadeform provisioner."""
2
+
3
+ from sky.provision.shadeform.config import bootstrap_instances
4
+ from sky.provision.shadeform.instance import cleanup_ports
5
+ from sky.provision.shadeform.instance import get_cluster_info
6
+ from sky.provision.shadeform.instance import open_ports
7
+ from sky.provision.shadeform.instance import query_instances
8
+ from sky.provision.shadeform.instance import run_instances
9
+ from sky.provision.shadeform.instance import stop_instances
10
+ from sky.provision.shadeform.instance import terminate_instances
11
+ from sky.provision.shadeform.instance import wait_instances
@@ -0,0 +1,12 @@
1
+ """Shadeform configuration bootstrapping."""
2
+
3
+ from sky.provision import common
4
+
5
+
6
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return the provisioning config for the cluster unchanged.

    No bootstrapping work is performed here: the config that is passed in
    is handed straight back to the caller.

    Args:
        region: Ignored.
        cluster_name: Ignored.
        config: The provisioning config to pass through.

    Returns:
        The very same ``config`` object that was supplied.
    """
    # Both arguments exist only to satisfy the provisioner interface.
    del cluster_name, region
    return config
@@ -0,0 +1,351 @@
1
+ """Shadeform instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import requests
6
+
7
+ from sky import sky_logging
8
+ from sky.provision import common
9
+ from sky.provision.shadeform import shadeform_utils
10
+ from sky.utils import status_lib
11
+
12
# Seconds to sleep between two consecutive readiness polls.
POLL_INTERVAL = 10
# Default upper bound, in seconds, on how long to wait for readiness.
INSTANCE_READY_TIMEOUT = 3600

logger = sky_logging.init_logger(__name__)

# Translation table: Shadeform instance status -> SkyPilot cluster status.
SHADEFORM_STATUS_MAP = {
    # Every "not ready yet" state collapses to INIT.
    **dict.fromkeys(('creating', 'pending_provider', 'pending'),
                    status_lib.ClusterStatus.INIT),
    'active': status_lib.ClusterStatus.UP,
    'deleted': status_lib.ClusterStatus.STOPPED,
}
25
+
26
+
27
def _get_cluster_instances(cluster_name_on_cloud: str) -> Dict[str, Any]:
    """Collect the Shadeform instances that belong to one cluster.

    An instance belongs to the cluster when its name is exactly
    ``<cluster_name_on_cloud>-head`` or ``<cluster_name_on_cloud>-worker``.

    Args:
        cluster_name_on_cloud: Cluster name used as the instance-name prefix.

    Returns:
        Mapping of instance id to the raw instance dict returned by the API.
        An empty dict is returned both when nothing matches and when the
        listing fails (the failure is logged as a warning, not raised).
    """
    wanted_names = (f'{cluster_name_on_cloud}-head',
                    f'{cluster_name_on_cloud}-worker')
    try:
        listing = shadeform_utils.get_instances()
        return {
            entry['id']: entry
            for entry in listing.get('instances', [])
            if entry.get('name') in wanted_names
        }
    except (ValueError, KeyError, requests.exceptions.RequestException) as e:
        # NOTE(review): callers cannot tell "API failed" from "no instances".
        logger.warning(f'Failed to get instances: {e}')
        return {}
46
+
47
+
48
def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
    """Find the id of the head node among ``instances``.

    Args:
        instances: Mapping of instance id to instance dict.

    Returns:
        The id of the first instance (in mapping order) whose ``name`` ends
        with ``-head``, or None when there is no such instance. Instances
        without a ``name`` key are treated as having an empty name.
    """
    head_ids = (iid for iid, info in instances.items()
                if info.get('name', '').endswith('-head'))
    return next(head_ids, None)
54
+
55
+
56
def _wait_for_instances_ready(cluster_name_on_cloud: str,
                              expected_count: int,
                              timeout: int = INSTANCE_READY_TIMEOUT) -> bool:
    """Poll until enough cluster instances are ready, or until timeout.

    An instance counts as ready when its status is ``active`` and both its
    ``ip`` and ``ssh_port`` fields are set.

    Args:
        cluster_name_on_cloud: Cluster whose instances are polled.
        expected_count: Number of ready instances required.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as at least ``expected_count`` instances are ready,
        False if the timeout elapses first.
    """

    def _is_ready(node: Dict[str, Any]) -> bool:
        if node.get('status') != 'active':
            return False
        return (node.get('ip') is not None and
                node.get('ssh_port') is not None)

    began_at = time.time()
    while time.time() - began_at < timeout:
        nodes = _get_cluster_instances(cluster_name_on_cloud)
        ready_count = sum(1 for node in nodes.values() if _is_ready(node))

        logger.info(f'Waiting for instances to be ready: '
                    f'({ready_count}/{expected_count})')

        if ready_count >= expected_count:
            return True

        # Not there yet: back off before asking the API again.
        time.sleep(POLL_INTERVAL)

    return False
81
+
82
+
83
def _parse_instance_type(instance_type_full: Any) -> Tuple[str, str]:
    """Split a catalog instance type into (cloud, shade_instance_type).

    The catalog name has the form ``{cloud}_{instance_type}``, e.g.
    ``massedcompute_A6000-basex2`` -> ``('massedcompute', 'A6000_basex2')``.
    Only the first underscore separates the cloud from the instance type.

    Raises:
        ValueError: If the value is not a string containing an underscore,
            or if either resulting part is empty.
    """
    if (not isinstance(instance_type_full, str) or
            '_' not in instance_type_full):
        raise ValueError(
            f'InstanceType must be in format cloud_instance_type, got: '
            f'{instance_type_full}')

    cloud, _, instance_type = instance_type_full.partition('_')

    # Shadeform uses underscores instead of hyphens.
    instance_type = instance_type.replace('-', '_')

    # Drop a trailing "B" (e.g. "...80GB" -> "...80G").
    if instance_type.endswith('B'):
        instance_type = instance_type[:-1]

    # Replace "GBx" with "Gx" (case sensitive).
    instance_type = instance_type.replace('GBx', 'Gx')

    if not cloud:
        raise ValueError('Cloud provider cannot be empty')
    if not instance_type:
        raise ValueError('Instance type cannot be empty')
    return cloud, instance_type


def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Create the instances a cluster is missing and wait until ready.

    Existing ``active`` instances are counted against ``config.count``; only
    the shortfall is created. The first instance created when the cluster
    has no live head is named ``<cluster>-head``, all others
    ``<cluster>-worker``.

    Args:
        region: Region passed to Shadeform for every created instance.
        cluster_name: Unused; ``cluster_name_on_cloud`` is used instead.
        cluster_name_on_cloud: Name prefix of the cluster's instances.
        config: Provision config; ``count``, ``node_config['InstanceType']``
            and ``authentication_config`` are read.

    Returns:
        A ProvisionRecord naming the head instance and the instances
        created by this call.

    Raises:
        ValueError: If ``InstanceType`` is missing or malformed.
        RuntimeError: If enough instances exist but none is a head node, or
            if the instances do not become ready before the timeout.
        Exception: Any error raised while creating an instance is re-raised
            after the instances created by this call have been deleted.
    """
    del cluster_name  # unused - we use cluster_name_on_cloud
    logger.info(f'Running instances for cluster {cluster_name_on_cloud} '
                f'in region {region}')
    logger.debug(f'DEBUG: region type={type(region)}, value={region!r}')
    logger.debug(f'DEBUG: config node_config={config.node_config}')

    # Check existing instances
    existing_instances = _get_cluster_instances(cluster_name_on_cloud)

    # A head that is already 'deleted' must not be reused: otherwise no new
    # head is created and a dead instance id is reported as the head.
    live_instances = {
        iid: inst
        for iid, inst in existing_instances.items()
        if inst.get('status') != 'deleted'
    }
    head_instance_id = _get_head_instance_id(live_instances)

    # Filter active instances
    active_instances = {
        iid: inst
        for iid, inst in existing_instances.items()
        if inst.get('status') == 'active'
    }

    current_count = len(active_instances)
    target_count = config.count

    logger.info(f'Current instances: {current_count}, target: {target_count}')

    if current_count >= target_count:
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head node')
        logger.info(f'Cluster already has {current_count} instances, '
                    f'no need to start more')
        return common.ProvisionRecord(
            provider_name='shadeform',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=None,  # Shadeform doesn't use separate zones
            head_instance_id=head_instance_id,
            resumed_instance_ids=[],
            created_instance_ids=[])

    # The instance spec is identical for every node, so validate and parse
    # it once, before anything is created.
    node_config = config.node_config
    if 'InstanceType' not in node_config:
        raise ValueError('InstanceType must be present in node_config')
    cloud, instance_type = _parse_instance_type(node_config['InstanceType'])

    # Get SSH key ID for authentication - this is optional and may be None
    ssh_key_id = config.authentication_config.get('ssh_key_id')

    # Create new instances
    to_create = target_count - current_count
    created_instance_ids = []

    for _ in range(to_create):
        node_type = 'head' if head_instance_id is None else 'worker'
        instance_name = f'{cluster_name_on_cloud}-{node_type}'

        create_config = {
            'cloud': cloud,
            'region': region,
            'shade_instance_type': instance_type,
            'name': instance_name,
            'ssh_key_id': ssh_key_id
        }

        try:
            logger.info(f'Creating {node_type} instance: {instance_name}')
            response = shadeform_utils.create_instance(create_config)
            instance_id = response['id']
            created_instance_ids.append(instance_id)

            if head_instance_id is None:
                head_instance_id = instance_id

            logger.info(f'Created instance {instance_id} ({node_type})')

        except Exception as e:
            logger.error(f'Failed to create instance: {e}')
            # Clean up any created instances. Cleanup is best effort: a
            # failure here must neither hide the original error nor stop
            # the remaining instances from being deleted.
            for iid in created_instance_ids:
                try:
                    shadeform_utils.delete_instance(iid)
                except Exception as cleanup_e:  # pylint: disable=broad-except
                    logger.warning(
                        f'Failed to cleanup instance {iid}: {cleanup_e}')
            raise

    # Wait for all instances to be ready
    logger.info('Waiting for instances to become ready...')
    if not _wait_for_instances_ready(cluster_name_on_cloud, target_count):
        raise RuntimeError('Timed out waiting for instances to be ready')

    if head_instance_id is None:
        raise RuntimeError('head_instance_id should not be None')

    # NOTE(review): the early return above reports zone=None while this one
    # reports zone=region - confirm which one callers expect.
    return common.ProvisionRecord(provider_name='shadeform',
                                  cluster_name=cluster_name_on_cloud,
                                  region=region,
                                  zone=region,
                                  head_instance_id=head_instance_id,
                                  resumed_instance_ids=[],
                                  created_instance_ids=created_instance_ids)
220
+
221
+
222
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Do nothing; present to satisfy the provisioner interface.

    Shadeform instances count as ready once they reach 'active' status, and
    run_instances already waits for that, so nothing is left to do here.

    Args:
        region: Ignored.
        cluster_name_on_cloud: Ignored.
        state: Ignored.
    """
    del state, cluster_name_on_cloud, region  # unused
228
+
229
+
230
def stop_instances(cluster_name_on_cloud: str,
                   provider_config: Optional[Dict[str, Any]] = None,
                   worker_only: bool = False) -> None:
    """Reject the request: this provisioner cannot stop instances.

    Args:
        cluster_name_on_cloud: Ignored.
        provider_config: Ignored.
        worker_only: Ignored.

    Raises:
        NotImplementedError: Always.
    """
    del worker_only, provider_config, cluster_name_on_cloud  # unused
    reason = 'Stopping instances is not supported by Shadeform'
    raise NotImplementedError(reason)
237
+
238
+
239
def terminate_instances(cluster_name_on_cloud: str,
                        provider_config: Optional[Dict[str, Any]] = None,
                        worker_only: bool = False) -> None:
    """Delete the instances of a cluster.

    Deletion is best effort: a request error for one instance is logged as
    a warning and the remaining instances are still processed.

    Args:
        cluster_name_on_cloud: Cluster whose instances are deleted.
        provider_config: Ignored.
        worker_only: When True, instances whose name ends with ``-head``
            are kept and only the others are deleted.
    """
    del provider_config  # unused
    logger.info(f'Terminating instances for cluster {cluster_name_on_cloud}')

    cluster_nodes = _get_cluster_instances(cluster_name_on_cloud)
    if not cluster_nodes:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    def _is_head(node: Dict[str, Any]) -> bool:
        return node.get('name', '').endswith('-head')

    # The head node is spared only when the caller asked for workers only.
    doomed = [(iid, node)
              for iid, node in cluster_nodes.items()
              if not (worker_only and _is_head(node))]

    for iid, node in doomed:
        try:
            logger.info(f'Terminating instance {iid} ({node.get("name")})')
            shadeform_utils.delete_instance(iid)
        except requests.exceptions.RequestException as e:
            logger.warning(f'Failed to terminate instance {iid}: {e}')
268
+
269
+
270
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Build the ClusterInfo describing a cluster's instances.

    Each instance's ``ip`` is used for both the internal and the external
    address, and ``ssh_port`` falls back to 22 when absent. The SSH user is
    taken from the head instance's ``ssh_user`` field, defaulting to
    ``shadeform``.

    Args:
        region: Ignored.
        cluster_name_on_cloud: Cluster to describe.
        provider_config: Ignored.

    Returns:
        A ClusterInfo; it has no instances and no head when the cluster has
        no instances.
    """
    del provider_config, region  # unused
    nodes = _get_cluster_instances(cluster_name_on_cloud)

    if not nodes:
        return common.ClusterInfo(instances={},
                                  head_instance_id=None,
                                  provider_name='shadeform')

    head_id = _get_head_instance_id(nodes)

    # ClusterInfo expects Dict[InstanceId, List[InstanceInfo]]
    per_instance = {
        iid: [
            common.InstanceInfo(
                instance_id=iid,
                internal_ip=node.get('ip', ''),
                external_ip=node.get('ip', ''),
                ssh_port=node.get('ssh_port', 22),
                tags={},
            )
        ] for iid, node in nodes.items()
    }

    if head_id is None:
        login_user = 'shadeform'  # default
    else:
        login_user = nodes.get(head_id, {}).get('ssh_user', 'shadeform')

    return common.ClusterInfo(instances=per_instance,
                              head_instance_id=head_id,
                              provider_name='shadeform',
                              ssh_user=login_user)
307
+
308
+
309
def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """Report the SkyPilot status of every instance in a cluster.

    Shadeform statuses are translated through SHADEFORM_STATUS_MAP; a
    missing or unrecognised status is reported as INIT.

    Args:
        cluster_name: Ignored.
        cluster_name_on_cloud: Cluster whose instances are queried.
        provider_config: Ignored.
        non_terminated_only: When True, instances whose translated status
            is STOPPED are left out of the result.

    Returns:
        Mapping of instance id to ``(status, None)``.
    """
    del provider_config, cluster_name  # unused

    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                              Optional[str]]] = {}
    nodes = _get_cluster_instances(cluster_name_on_cloud)
    for iid, node in nodes.items():
        mapped = SHADEFORM_STATUS_MAP.get(node.get('status', 'unknown'),
                                          status_lib.ClusterStatus.INIT)
        hidden = (non_terminated_only and
                  mapped == status_lib.ClusterStatus.STOPPED)
        if not hidden:
            statuses[iid] = (mapped, None)

    return statuses
336
+
337
+
338
def open_ports(cluster_name_on_cloud: str,
               ports: List[str],
               provider_config: Optional[Dict[str, Any]] = None) -> None:
    """Reject the request: opening ports is not supported for Shadeform.

    Args:
        cluster_name_on_cloud: Ignored.
        ports: Ignored.
        provider_config: Ignored.

    Raises:
        NotImplementedError: Always, with no message.
    """
    del provider_config, ports, cluster_name_on_cloud  # unused
    unsupported = NotImplementedError()
    raise unsupported
344
+
345
+
346
def cleanup_ports(cluster_name_on_cloud: str,
                  ports: List[str],
                  provider_config: Optional[Dict[str, Any]] = None) -> None:
    """Do nothing: there are never any opened ports to clean up.

    Dynamic port opening is not supported for Shadeform, so this is a no-op
    kept for the provisioner interface.

    Args:
        cluster_name_on_cloud: Ignored.
        ports: Ignored.
        provider_config: Ignored.
    """
    del provider_config, ports, cluster_name_on_cloud  # unused
    return None
@@ -0,0 +1,83 @@
1
+ """Shadeform API utilities."""
2
+
3
+ import os
4
+ from typing import Any, Dict
5
+
6
+ from sky.adaptors import common
7
+
8
+ # Lazy import to avoid dependency on external packages
9
+ requests = common.LazyImport('requests')
10
+
11
+ # Shadeform API configuration
12
+ SHADEFORM_API_BASE = 'https://api.shadeform.ai/v1'
13
+ SHADEFORM_API_KEY_PATH = '~/.shadeform/api_key'
14
+
15
+
16
def get_api_key() -> str:
    """Read the Shadeform API key from ``SHADEFORM_API_KEY_PATH``.

    Returns:
        The file's contents with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If the (user-expanded) key path does not exist.
        ValueError: If the file contains nothing but whitespace.
    """
    key_file = os.path.expanduser(SHADEFORM_API_KEY_PATH)

    key_file_present = os.path.exists(key_file)
    if not key_file_present:
        missing_msg = (f'Shadeform API key not found at {key_file}. '
                       'Please save your API key to this file.')
        raise FileNotFoundError(missing_msg)

    with open(key_file, 'r', encoding='utf-8') as key_fp:
        raw_contents = key_fp.read()

    token = raw_contents.strip()
    if token:
        return token
    raise ValueError(f'Shadeform API key is empty in {key_file}')
31
+
32
+
33
# Upper bound (in seconds) applied to a Shadeform API call when the caller
# does not pass its own ``timeout``. ``requests`` never times out by default,
# so without this a stalled connection would block the caller forever.
_DEFAULT_REQUEST_TIMEOUT_SECONDS = 60


def make_request(method: str, endpoint: str, **kwargs) -> Any:
    """Make a request to the Shadeform API.

    Args:
        method: HTTP method, e.g. ``'GET'`` or ``'POST'``.
        endpoint: Path relative to ``SHADEFORM_API_BASE``; leading slashes
            are stripped before it is joined to the base URL.
        **kwargs: Forwarded to ``requests.request`` (e.g. ``json=...``). A
            ``timeout`` given here overrides the default of
            ``_DEFAULT_REQUEST_TIMEOUT_SECONDS``.

    Returns:
        The decoded JSON body, or an empty dict when the response body is
        empty or whitespace-only.

    Raises:
        Whatever ``get_api_key`` raises when the key cannot be read, the
        error raised by ``response.raise_for_status()`` for an unsuccessful
        status, and the ``requests`` timeout error when the call exceeds
        the timeout.
    """
    url = f'{SHADEFORM_API_BASE}/{endpoint.lstrip("/")}'
    headers = {
        'X-API-KEY': get_api_key(),
        'Content-Type': 'application/json',
    }

    # Only fill in the timeout when the caller did not choose one.
    kwargs.setdefault('timeout', _DEFAULT_REQUEST_TIMEOUT_SECONDS)

    response = requests.request(method, url, headers=headers, **kwargs)
    response.raise_for_status()

    # Some APIs (like delete) return empty responses with just 200 status
    if response.text.strip():
        return response.json()
    # Return empty dict for empty responses (e.g., delete operations)
    return {}
50
+
51
+
52
def get_instances() -> Dict[str, Any]:
    """Fetch all instances via ``GET /instances``.

    Returns:
        Whatever ``make_request`` returns for that endpoint.
    """
    listing = make_request('GET', '/instances')
    return listing
55
+
56
+
57
def get_instance_info(instance_id: str) -> Dict[str, Any]:
    """Fetch details of one instance via ``GET /instances/<id>/info``.

    Args:
        instance_id: Identifier interpolated into the endpoint path.

    Returns:
        Whatever ``make_request`` returns for that endpoint.
    """
    info_endpoint = f'/instances/{instance_id}/info'
    return make_request('GET', info_endpoint)
60
+
61
+
62
def create_instance(config: Dict[str, Any]) -> Dict[str, Any]:
    """Create an instance via ``POST /instances/create``.

    Args:
        config: Sent as the JSON body of the request.

    Returns:
        Whatever ``make_request`` returns for that endpoint.
    """
    creation_result = make_request('POST', '/instances/create', json=config)
    return creation_result
65
+
66
+
67
def delete_instance(instance_id: str) -> Dict[str, Any]:
    """Delete an instance via ``POST /instances/<id>/delete``.

    Note: Shadeform delete API returns empty response with 200 status.

    Args:
        instance_id: Identifier interpolated into the endpoint path.

    Returns:
        Whatever ``make_request`` returns for that endpoint.
    """
    delete_endpoint = f'/instances/{instance_id}/delete'
    return make_request('POST', delete_endpoint)
73
+
74
+
75
def get_ssh_keys() -> Dict[str, Any]:
    """Fetch all SSH keys via ``GET /sshkeys``.

    Returns:
        Whatever ``make_request`` returns for that endpoint.
    """
    registered_keys = make_request('GET', '/sshkeys')
    return registered_keys
78
+
79
+
80
def add_ssh_key(name: str, public_key: str) -> Dict[str, Any]:
    """Add an SSH key via ``POST /sshkeys/add``.

    Args:
        name: Sent as the ``name`` field of the JSON body.
        public_key: Sent as the ``public_key`` field of the JSON body.

    Returns:
        Whatever ``make_request`` returns for that endpoint.
    """
    return make_request('POST',
                        '/sshkeys/add',
                        json={
                            'name': name,
                            'public_key': public_key,
                        })
sky/serve/constants.py CHANGED
@@ -76,9 +76,6 @@ CONTROLLER_AUTOSTOP = {
76
76
  # A period of time to initialize your service. Any readiness probe failures
77
77
  # during this period will be ignored.
78
78
  DEFAULT_INITIAL_DELAY_SECONDS = 1200
79
- # For pool, we shrink the initial delay to 300s to make the pool more
80
- # responsive to the failure that setup command starts a long-running server.
81
- DEFAULT_INITIAL_DELAY_SECONDS_POOL = 300
82
79
  DEFAULT_MIN_REPLICAS = 1
83
80
 
84
81
  # Default port range start for controller and load balancer. Ports will be
sky/serve/service_spec.py CHANGED
@@ -125,12 +125,6 @@ class SkyServiceSpec:
125
125
  self.base_ondemand_fallback_replicas is not None and
126
126
  self.base_ondemand_fallback_replicas > 0)
127
127
 
128
- @staticmethod
129
- def _get_initial_delay_seconds(pool: bool) -> int:
130
- if pool:
131
- return constants.DEFAULT_INITIAL_DELAY_SECONDS_POOL
132
- return constants.DEFAULT_INITIAL_DELAY_SECONDS
133
-
134
128
  @staticmethod
135
129
  def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec':
136
130
  common_utils.validate_schema(config, schemas.get_service_schema(),
@@ -159,8 +153,7 @@ class SkyServiceSpec:
159
153
  'timeout_seconds', None)
160
154
  readiness_headers = readiness_section.get('headers', None)
161
155
  if initial_delay_seconds is None:
162
- initial_delay_seconds = SkyServiceSpec._get_initial_delay_seconds(
163
- config.get('pool', False))
156
+ initial_delay_seconds = constants.DEFAULT_INITIAL_DELAY_SECONDS
164
157
  service_config['initial_delay_seconds'] = initial_delay_seconds
165
158
  if readiness_timeout_seconds is None:
166
159
  readiness_timeout_seconds = (
sky/server/constants.py CHANGED
@@ -64,3 +64,7 @@ DAEMON_RESTART_INTERVAL_SECONDS = 20
64
64
 
65
65
  # Cookie header for stream request id.
66
66
  STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'
67
+
68
+ # Valid empty values for pickled fields (base64-encoded pickled None)
69
+ # base64.b64encode(pickle.dumps(None)).decode('utf-8')
70
+ EMPTY_PICKLED_VALUE = 'gAROLg=='
@@ -81,6 +81,26 @@ logger = sky_logging.init_logger(__name__)
81
81
  # platforms, including macOS.
82
82
  multiprocessing.set_start_method('spawn', force=True)
83
83
 
84
# Max worker threads, sized like the default thread pool executor used by
# the event loop: min(32, cpu_count + 4). An undetectable CPU count is treated
# as 1, which is the fallback CPython's ThreadPoolExecutor default uses.
_REQUEST_THREADS_LIMIT = min(32, (os.cpu_count() or 1) + 4)

# Guards the one-time creation of _REQUEST_THREAD_EXECUTOR.
_REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
# A dedicated thread pool executor for synced requests execution in coroutine
_REQUEST_THREAD_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None


def get_request_thread_executor() -> concurrent.futures.ThreadPoolExecutor:
    """Lazy init and return the request thread executor for current process.

    The executor is created on first use with ``_REQUEST_THREADS_LIMIT``
    workers and the same object is returned on every later call.

    Returns:
        The module-wide ``ThreadPoolExecutor`` for request execution.
    """
    global _REQUEST_THREAD_EXECUTOR
    # Fast path: skip the lock once the executor exists.
    if _REQUEST_THREAD_EXECUTOR is None:
        with _REQUEST_THREAD_EXECUTOR_LOCK:
            # Re-check under the lock so only one caller creates the pool.
            if _REQUEST_THREAD_EXECUTOR is None:
                _REQUEST_THREAD_EXECUTOR = (
                    concurrent.futures.ThreadPoolExecutor(
                        max_workers=_REQUEST_THREADS_LIMIT))
    return _REQUEST_THREAD_EXECUTOR
103
+
84
104
 
85
105
  class RequestQueue:
86
106
  """The queue for the requests, either redis or multiprocessing.
@@ -576,8 +596,8 @@ async def _execute_request_coroutine(request: api_requests.Request):
576
596
  # 1. skypilot config is not contextual
577
597
  # 2. envs that read directly from os.environ are not contextual
578
598
  ctx.override_envs(request_body.env_vars)
579
- fut: asyncio.Future = context_utils.to_thread(func,
580
- **request_body.to_kwargs())
599
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
600
+ get_request_thread_executor(), func, **request_body.to_kwargs())
581
601
 
582
602
  async def poll_task(request_id: str) -> bool:
583
603
  req_status = await api_requests.get_request_status_async(request_id)
@@ -573,6 +573,8 @@ class RequestStatusBody(pydantic.BaseModel):
573
573
  """The request body for the API request status endpoint."""
574
574
  request_ids: Optional[List[str]] = None
575
575
  all_status: bool = False
576
+ limit: Optional[int] = None
577
+ fields: Optional[List[str]] = None
576
578
 
577
579
 
578
580
  class ServeUpBody(RequestBody):