skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20250513__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +3 -0
- sky/backends/cloud_vm_ray_backend.py +7 -0
- sky/cli.py +109 -109
- sky/client/cli.py +109 -109
- sky/clouds/gcp.py +35 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → 2dkponv64SfFShA8Rnw0D}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/global_user_state.py +2 -0
- sky/provision/docker_utils.py +4 -1
- sky/provision/gcp/config.py +197 -15
- sky/provision/gcp/constants.py +64 -0
- sky/provision/gcp/instance.py +5 -3
- sky/provision/gcp/instance_utils.py +8 -4
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +4 -2
- sky/server/requests/executor.py +114 -22
- sky/server/requests/requests.py +15 -0
- sky/server/server.py +12 -7
- sky/server/uvicorn.py +12 -2
- sky/sky_logging.py +40 -2
- sky/skylet/constants.py +3 -0
- sky/skylet/log_lib.py +51 -11
- sky/templates/gcp-ray.yml.j2 +11 -0
- sky/templates/nebius-ray.yml.j2 +4 -0
- sky/templates/websocket_proxy.py +29 -9
- sky/utils/command_runner.py +3 -0
- sky/utils/context.py +264 -0
- sky/utils/context_utils.py +172 -0
- sky/utils/rich_utils.py +81 -37
- sky/utils/schemas.py +9 -1
- sky/utils/subprocess_utils.py +8 -2
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/METADATA +1 -5
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/RECORD +46 -44
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → 2dkponv64SfFShA8Rnw0D}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/top_level.txt +0 -0
sky/dashboard/out/jobs.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><link rel="preload" href="/dashboard/skypilot.svg" as="image" fetchpriority="high"/><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-0f8017370869e269.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-f49500b82ad5392d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js" defer=""></script><script src="/dashboard/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LksQgChY5izXjokL3LcEu/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div class="min-h-screen bg-gray-50"><div class="fixed top-0 left-0 right-0 z-50 shadow-sm"><div class="fixed top-0 left-0 right-0 bg-white z-30 h-14 px-4 border-b border-gray-200 shadow-sm"><div class="flex items-center h-full"><div class="flex items-center space-x-4 mr-6"><a class="flex items-center px-1 pt-1 h-full" 
href="/dashboard"><div class="h-20 w-20 flex items-center justify-center"><img alt="SkyPilot Logo" fetchpriority="high" width="80" height="80" decoding="async" data-nimg="1" class="w-full h-full object-contain" style="color:transparent" src="/dashboard/skypilot.svg"/></div></a></div><div class="flex items-center space-x-2 md:space-x-6 mr-6"><a class="inline-flex items-center border-b-2 border-transparent hover:text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/clusters"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="20" height="8" x="2" y="2" rx="2" ry="2"></rect><rect width="20" height="8" x="2" y="14" rx="2" ry="2"></rect><line x1="6" x2="6.01" y1="6" y2="6"></line><line x1="6" x2="6.01" y1="18" y2="18"></line></svg><span>Clusters</span></a><a class="inline-flex items-center border-b-2 border-transparent text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/jobs"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M16 20V4a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"></path><rect width="20" height="14" x="2" y="6" rx="2"></rect></svg><span>Jobs</span></a><div class="inline-flex items-center px-1 pt-1 text-gray-400"><svg class="w-4 h-4" viewBox="0 0 423.683 423.683" width="24" height="24" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" fill="currentColor" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><g id="SVGRepo_bgCarrier" stroke-width="0"></g><g id="SVGRepo_tracerCarrier" stroke-linecap="round" stroke-linejoin="round"></g><g id="SVGRepo_iconCarrier"><g><path d="M54.376,287.577h310.459c26.48,0,48.02-13.979,48.02-40.453c0-17.916-10.001-34.07-25.559-42.292 
c-19.021-72.951-86.061-125.196-162.002-125.223v-3.431h-3.854V61.814h3.854v-9.569h-31.38v9.569h3.854v14.363h-3.854v3.431 c-75.941,0.026-142.97,52.272-161.988,125.217c-15.56,8.216-25.573,24.376-25.573,42.291 C6.36,273.597,27.896,287.577,54.376,287.577z M47.676,227.145l7.214-2.424l1.617-7.447 c13.884-64.232,71.707-110.862,137.467-110.862h31.274c65.763,0,123.582,46.63,137.473,110.862l1.607,7.447l7.223,2.424 c8.678,2.92,14.506,10.946,14.506,19.979c0,11.703-9.517,13.647-21.221,13.647H54.376c-11.7,0-21.22-1.944-21.22-13.647 C33.162,238.091,38.984,230.065,47.676,227.145z M423.683,334.602v36.836H0v-36.836h25.348v-18.418h372.99v18.418H423.683z"></path></g></g></svg><span class="ml-2">Services</span><span class="text-xs ml-2 px-1.5 py-0.5 bg-gray-100 text-gray-500 rounded">Soon</span></div></div><div class="flex items-center space-x-1 ml-auto"><a href="https://skypilot.readthedocs.io/en/latest/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center px-2 py-1 text-gray-600 hover:text-blue-600 transition-colors duration-150 cursor-pointer" title="Docs"><span class="mr-1">Docs</span><svg class="w-3.5 h-3.5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"></path><polyline points="15 3 21 3 21 9"></polyline><line x1="10" y1="14" x2="21" y2="3"></line></svg></a><div class="border-l border-gray-200 h-6 mx-1"></div><a href="https://github.com/skypilot-org/skypilot" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="GitHub"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 
11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"></path></svg></a><a href="https://slack.skypilot.co/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Slack"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path transform="scale(0.85) translate(1.8, 1.8)" d="M5.042 15.165a2.528 2.528 0 0 1-2.52 2.523A2.528 2.528 0 0 1 0 15.165a2.527 2.527 0 0 1 2.522-2.52h2.52v2.52zM6.313 15.165a2.527 2.527 0 0 1 2.521-2.52 2.527 2.527 0 0 1 2.521 2.52v6.313A2.528 2.528 0 0 1 8.834 24a2.528 2.528 0 0 1-2.521-2.522v-6.313zM8.834 5.042a2.528 2.528 0 0 1-2.521-2.52A2.528 2.528 0 0 1 8.834 0a2.528 2.528 0 0 1 2.521 2.522v2.52H8.834zM8.834 6.313a2.528 2.528 0 0 1 2.521 2.521 2.528 2.528 0 0 1-2.521 2.521H2.522A2.528 2.528 0 0 1 0 8.834a2.528 2.528 0 0 1 2.522-2.521h6.312zM18.956 8.834a2.528 2.528 0 0 1 2.522-2.521A2.528 2.528 0 0 1 24 8.834a2.528 2.528 0 0 1-2.522 2.521h-2.522V8.834zM17.688 8.834a2.528 2.528 0 0 1-2.523 2.521 2.527 2.527 0 0 1-2.52-2.521V2.522A2.527 2.527 0 0 1 15.165 0a2.528 2.528 0 0 1 2.523 2.522v6.312zM15.165 18.956a2.528 2.528 0 0 1 2.523 2.522A2.528 2.528 0 0 1 15.165 24a2.527 2.527 0 0 1-2.52-2.522v-2.522h2.52zM15.165 17.688a2.527 2.527 0 0 
1-2.52-2.523 2.526 2.526 0 0 1 2.52-2.52h6.313A2.527 2.527 0 0 1 24 15.165a2.528 2.528 0 0 1-2.522 2.523h-6.313z"></path></svg></a><a href="https://github.com/skypilot-org/skypilot/issues/new" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Leave Feedback"><svg class="w-5 h-5" stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><g><path fill="none" d="M0 0h24v24H0z"></path><path d="M6.455 19L2 22.5V4a1 1 0 0 1 1-1h18a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H6.455zM4 18.385L5.763 17H20V5H4v13.385zM11 13h2v2h-2v-2zm0-6h2v5h-2V7z"></path></g></svg></a></div></div></div></div><div class="transition-all duration-200 ease-in-out min-h-screen" style="padding-top:56px"><main class="p-6"><div class="flex items-center justify-between mb-4 h-5"><div class="text-base"><a class="text-sky-blue hover:underline leading-none" href="/dashboard/jobs">Managed Jobs</a></div><div class="flex items-center space-x-2"><button class="inline-flex items-center justify-center whitespace-nowrap text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 hover:bg-accent h-9 rounded-md px-3 text-sky-blue hover:text-sky-blue-bright" title="Refresh"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-rotate-cw h-4 w-4 mr-1.5"><path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8"></path><path d="M21 3v5h-5"></path></svg><span>Refresh</span></button></div></div><div class="relative"><div class="flex flex-col space-y-1 mb-1"><div class="flex flex-wrap items-center text-sm mb-1"><span class="mr-2 text-sm 
font-medium">Statuses:</span><div class="flex flex-wrap gap-2 items-center"></div></div></div><div class="rounded-lg border bg-card text-card-foreground shadow-sm"><div class="relative w-full overflow-auto"><table class="w-full caption-bottom text-base"><thead class="[&_tr]:border-b"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">ID</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Name</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">User</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Submitted</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Duration</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Status</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Resources</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Cluster</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Region</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Recoveries</th><th class="h-12 px-4 text-left align-middle font-medium 
text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0">Details</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0">Logs</th></tr></thead><tbody class="[&_tr:last-child]:border-0"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><td class="p-4 align-middle [&:has([role=checkbox])]:pr-0 text-center py-6" colSpan="12"><div class="flex flex-col items-center space-y-4"><p class="text-gray-500">No active jobs</p></div></td></tr></tbody></table></div></div></div></main></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"LksQgChY5izXjokL3LcEu","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><link rel="preload" href="/dashboard/skypilot.svg" as="image" fetchpriority="high"/><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-0ca6f2c1ba667c3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-f49500b82ad5392d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js" defer=""></script><script src="/dashboard/_next/static/2dkponv64SfFShA8Rnw0D/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2dkponv64SfFShA8Rnw0D/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div class="min-h-screen bg-gray-50"><div class="fixed top-0 left-0 right-0 z-50 shadow-sm"><div class="fixed top-0 left-0 right-0 bg-white z-30 h-14 px-4 border-b border-gray-200 shadow-sm"><div class="flex items-center h-full"><div class="flex items-center space-x-4 mr-6"><a class="flex items-center px-1 pt-1 h-full" 
href="/dashboard"><div class="h-20 w-20 flex items-center justify-center"><img alt="SkyPilot Logo" fetchpriority="high" width="80" height="80" decoding="async" data-nimg="1" class="w-full h-full object-contain" style="color:transparent" src="/dashboard/skypilot.svg"/></div></a></div><div class="flex items-center space-x-2 md:space-x-6 mr-6"><a class="inline-flex items-center border-b-2 border-transparent hover:text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/clusters"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="20" height="8" x="2" y="2" rx="2" ry="2"></rect><rect width="20" height="8" x="2" y="14" rx="2" ry="2"></rect><line x1="6" x2="6.01" y1="6" y2="6"></line><line x1="6" x2="6.01" y1="18" y2="18"></line></svg><span>Clusters</span></a><a class="inline-flex items-center border-b-2 border-transparent text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/jobs"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M16 20V4a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"></path><rect width="20" height="14" x="2" y="6" rx="2"></rect></svg><span>Jobs</span></a><div class="inline-flex items-center px-1 pt-1 text-gray-400"><svg class="w-4 h-4" viewBox="0 0 423.683 423.683" width="24" height="24" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" fill="currentColor" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><g id="SVGRepo_bgCarrier" stroke-width="0"></g><g id="SVGRepo_tracerCarrier" stroke-linecap="round" stroke-linejoin="round"></g><g id="SVGRepo_iconCarrier"><g><path d="M54.376,287.577h310.459c26.48,0,48.02-13.979,48.02-40.453c0-17.916-10.001-34.07-25.559-42.292 
c-19.021-72.951-86.061-125.196-162.002-125.223v-3.431h-3.854V61.814h3.854v-9.569h-31.38v9.569h3.854v14.363h-3.854v3.431 c-75.941,0.026-142.97,52.272-161.988,125.217c-15.56,8.216-25.573,24.376-25.573,42.291 C6.36,273.597,27.896,287.577,54.376,287.577z M47.676,227.145l7.214-2.424l1.617-7.447 c13.884-64.232,71.707-110.862,137.467-110.862h31.274c65.763,0,123.582,46.63,137.473,110.862l1.607,7.447l7.223,2.424 c8.678,2.92,14.506,10.946,14.506,19.979c0,11.703-9.517,13.647-21.221,13.647H54.376c-11.7,0-21.22-1.944-21.22-13.647 C33.162,238.091,38.984,230.065,47.676,227.145z M423.683,334.602v36.836H0v-36.836h25.348v-18.418h372.99v18.418H423.683z"></path></g></g></svg><span class="ml-2">Services</span><span class="text-xs ml-2 px-1.5 py-0.5 bg-gray-100 text-gray-500 rounded">Soon</span></div></div><div class="flex items-center space-x-1 ml-auto"><a href="https://skypilot.readthedocs.io/en/latest/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center px-2 py-1 text-gray-600 hover:text-blue-600 transition-colors duration-150 cursor-pointer" title="Docs"><span class="mr-1">Docs</span><svg class="w-3.5 h-3.5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"></path><polyline points="15 3 21 3 21 9"></polyline><line x1="10" y1="14" x2="21" y2="3"></line></svg></a><div class="border-l border-gray-200 h-6 mx-1"></div><a href="https://github.com/skypilot-org/skypilot" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="GitHub"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 
11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"></path></svg></a><a href="https://slack.skypilot.co/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Slack"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path transform="scale(0.85) translate(1.8, 1.8)" d="M5.042 15.165a2.528 2.528 0 0 1-2.52 2.523A2.528 2.528 0 0 1 0 15.165a2.527 2.527 0 0 1 2.522-2.52h2.52v2.52zM6.313 15.165a2.527 2.527 0 0 1 2.521-2.52 2.527 2.527 0 0 1 2.521 2.52v6.313A2.528 2.528 0 0 1 8.834 24a2.528 2.528 0 0 1-2.521-2.522v-6.313zM8.834 5.042a2.528 2.528 0 0 1-2.521-2.52A2.528 2.528 0 0 1 8.834 0a2.528 2.528 0 0 1 2.521 2.522v2.52H8.834zM8.834 6.313a2.528 2.528 0 0 1 2.521 2.521 2.528 2.528 0 0 1-2.521 2.521H2.522A2.528 2.528 0 0 1 0 8.834a2.528 2.528 0 0 1 2.522-2.521h6.312zM18.956 8.834a2.528 2.528 0 0 1 2.522-2.521A2.528 2.528 0 0 1 24 8.834a2.528 2.528 0 0 1-2.522 2.521h-2.522V8.834zM17.688 8.834a2.528 2.528 0 0 1-2.523 2.521 2.527 2.527 0 0 1-2.52-2.521V2.522A2.527 2.527 0 0 1 15.165 0a2.528 2.528 0 0 1 2.523 2.522v6.312zM15.165 18.956a2.528 2.528 0 0 1 2.523 2.522A2.528 2.528 0 0 1 15.165 24a2.527 2.527 0 0 1-2.52-2.522v-2.522h2.52zM15.165 17.688a2.527 2.527 0 0 
1-2.52-2.523 2.526 2.526 0 0 1 2.52-2.52h6.313A2.527 2.527 0 0 1 24 15.165a2.528 2.528 0 0 1-2.522 2.523h-6.313z"></path></svg></a><a href="https://github.com/skypilot-org/skypilot/issues/new" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Leave Feedback"><svg class="w-5 h-5" stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><g><path fill="none" d="M0 0h24v24H0z"></path><path d="M6.455 19L2 22.5V4a1 1 0 0 1 1-1h18a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H6.455zM4 18.385L5.763 17H20V5H4v13.385zM11 13h2v2h-2v-2zm0-6h2v5h-2V7z"></path></g></svg></a></div></div></div></div><div class="transition-all duration-200 ease-in-out min-h-screen" style="padding-top:56px"><main class="p-6"><div class="flex items-center justify-between mb-4 h-5"><div class="text-base"><a class="text-sky-blue hover:underline leading-none" href="/dashboard/jobs">Managed Jobs</a></div><div class="flex items-center space-x-2"><button class="inline-flex items-center justify-center whitespace-nowrap text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 hover:bg-accent h-9 rounded-md px-3 text-sky-blue hover:text-sky-blue-bright" title="Refresh"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-rotate-cw h-4 w-4 mr-1.5"><path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8"></path><path d="M21 3v5h-5"></path></svg><span>Refresh</span></button></div></div><div class="relative"><div class="flex flex-col space-y-1 mb-1"><div class="flex flex-wrap items-center text-sm mb-1"><span class="mr-2 text-sm 
font-medium">Statuses:</span><div class="flex flex-wrap gap-2 items-center"></div></div></div><div class="rounded-lg border bg-card text-card-foreground shadow-sm"><div class="relative w-full overflow-auto"><table class="w-full caption-bottom text-base"><thead class="[&_tr]:border-b"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">ID</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Name</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">User</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Submitted</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Duration</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Status</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Resources</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Cluster</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Region</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Recoveries</th><th class="h-12 px-4 text-left align-middle font-medium 
text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0">Details</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0">Logs</th></tr></thead><tbody class="[&_tr:last-child]:border-0"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><td class="p-4 align-middle [&:has([role=checkbox])]:pr-0 text-center py-6" colSpan="12"><div class="flex flex-col items-center space-y-4"><p class="text-gray-500">No active jobs</p></div></td></tr></tbody></table></div></div></div></main></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"2dkponv64SfFShA8Rnw0D","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/global_user_state.py
CHANGED
@@ -19,6 +19,7 @@ import uuid
|
|
19
19
|
from sky import models
|
20
20
|
from sky import sky_logging
|
21
21
|
from sky.utils import common_utils
|
22
|
+
from sky.utils import context_utils
|
22
23
|
from sky.utils import db_utils
|
23
24
|
from sky.utils import registry
|
24
25
|
from sky.utils import status_lib
|
@@ -671,6 +672,7 @@ def _load_storage_mounts_metadata(
|
|
671
672
|
return pickle.loads(record_storage_mounts_metadata)
|
672
673
|
|
673
674
|
|
675
|
+
@context_utils.cancellation_guard
|
674
676
|
def get_cluster_from_name(
|
675
677
|
cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
|
676
678
|
rows = _DB.cursor.execute(
|
sky/provision/docker_utils.py
CHANGED
@@ -343,9 +343,12 @@ class DockerInitializer:
|
|
343
343
|
# `mesg: ttyname failed: inappropriate ioctl for device`.
|
344
344
|
# see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
|
345
345
|
port = constants.DEFAULT_DOCKER_PORT
|
346
|
+
# In case the port is already configured in the sshd_config file
|
347
|
+
# in some images, we delete it first and then append the new one.
|
346
348
|
# pylint: disable=anomalous-backslash-in-string
|
347
349
|
self._run(
|
348
|
-
|
350
|
+
'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
|
351
|
+
f'sudo echo "Port {port}" >> /etc/ssh/sshd_config;'
|
349
352
|
'mkdir -p ~/.ssh;'
|
350
353
|
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
|
351
354
|
'sudo service ssh start;'
|
sky/provision/gcp/config.py
CHANGED
@@ -75,6 +75,30 @@ def wait_for_compute_global_operation(project_name, operation, compute):
|
|
75
75
|
return result
|
76
76
|
|
77
77
|
|
78
|
+
def wait_for_compute_region_operation(project_name, region, operation, compute):
|
79
|
+
"""Poll for region compute operation until finished."""
|
80
|
+
logger.info('wait_for_compute_region_operation: '
|
81
|
+
'Waiting for operation {} to finish...'.format(
|
82
|
+
operation['name']))
|
83
|
+
|
84
|
+
for _ in range(constants.MAX_POLLS):
|
85
|
+
result = (compute.regionOperations().get(
|
86
|
+
project=project_name,
|
87
|
+
region=region,
|
88
|
+
operation=operation['name'],
|
89
|
+
).execute())
|
90
|
+
if 'error' in result:
|
91
|
+
raise Exception(result['error'])
|
92
|
+
|
93
|
+
if result['status'] == 'DONE':
|
94
|
+
logger.info('wait_for_compute_region_operation: Operation done.')
|
95
|
+
break
|
96
|
+
|
97
|
+
time.sleep(constants.POLL_INTERVAL)
|
98
|
+
|
99
|
+
return result
|
100
|
+
|
101
|
+
|
78
102
|
def _create_crm(gcp_credentials=None):
|
79
103
|
return gcp.build('cloudresourcemanager',
|
80
104
|
'v1',
|
@@ -168,6 +192,7 @@ def bootstrap_instances(
|
|
168
192
|
iam_role = _configure_iam_role(config, crm, iam)
|
169
193
|
config.node_config.update(iam_role)
|
170
194
|
config = _configure_subnet(region, cluster_name, config, compute)
|
195
|
+
config = _configure_placement_policy(region, cluster_name, config, compute)
|
171
196
|
|
172
197
|
return config
|
173
198
|
|
@@ -660,6 +685,95 @@ def get_usable_vpc_and_subnet(
|
|
660
685
|
return usable_vpc_name, usable_subnet
|
661
686
|
|
662
687
|
|
688
|
+
def get_gpu_direct_usable_vpcs_and_subnets(
    cluster_name: str,
    region: str,
    config: common.ProvisionConfig,
    compute,
) -> List[Tuple[str, 'google.cloud.compute_v1.types.compute.Subnetwork']]:
    """Find or create the per-cluster VPCs and subnets used for GPU Direct.

    For each of ``constants.SKYPILOT_GPU_DIRECT_VPC_NUM`` networks this
    ensures the VPC exists, ensures it has a subnet in ``region``, applies
    ``constants.FIREWALL_RULES_TEMPLATE`` to it, and records the pair.

    Network names are built from ``constants.SKYPILOT`` and the first
    ``constants.CLUSTER_PREFIX_LENGTH`` characters of ``cluster_name``; the
    first network gets the ``mgmt-net`` suffix and the rest ``data-net-<i>``.

    Args:
        cluster_name: Cluster name; only its prefix is used for naming.
        region: Region in which subnets are listed and created.
        config: Provision config; ``provider_config['project_id']`` is read.
        compute: Compute API client forwarded to the helper functions.

    Returns:
        A list of ``(vpc_name, subnet)`` tuples, one per network, where
        ``subnet`` is the first entry returned by ``_list_subnets``.
    """
    project = config.provider_config['project_id']
    name_stem = (f'{constants.SKYPILOT}-'
                 f'{cluster_name[:constants.CLUSTER_PREFIX_LENGTH]}')
    pairs = []

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    network_count = constants.SKYPILOT_GPU_DIRECT_VPC_NUM

    cidr_stem = constants.SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX
    for index in range(network_count):
        suffix = 'mgmt-net' if index == 0 else f'data-net-{index}'
        network = f'{name_stem}-{suffix}'

        # Create the VPC only when no network with this exact name exists.
        if not _list_vpcnets(project, compute, filter=f'name={network}'):
            vpc_body = constants.VPC_TEMPLATE.copy()
            vpc_body.update(mtu=8244,
                            autoCreateSubnetworks=False,
                            name=network)
            vpc_body['selfLink'] = vpc_body['selfLink'].format(
                PROJ_ID=project, VPC_NAME=network)
            _create_vpcnet(project, compute, vpc_body)

        # Create the subnet only when the VPC has none in this region, then
        # list again to obtain the created resource.
        found = _list_subnets(project, region, compute, network=network)
        if not found:
            _create_subnet(project, region, compute, network,
                           f'{network}-sub', f'{cidr_stem}.{index}.0/24')
            found = _list_subnets(project,
                                  region,
                                  compute,
                                  network=network)

        # Firewall rules are applied on every call, for new and existing
        # networks alike.
        _create_rules(project, compute, constants.FIREWALL_RULES_TEMPLATE,
                      network)

        # NOTE(review): assumes the listing above is non-empty once the
        # subnet has been created -- confirm.
        pairs.append((network, found[0]))
    return pairs
|
735
|
+
|
736
|
+
|
737
|
+
def _configure_placement_policy(region: str, cluster_name: str,
                                config: common.ProvisionConfig, compute):
    """Attach a compact group placement policy to the node config.

    ``config`` is returned untouched when
    ``provider_config['placement_policy']`` is missing or is not
    ``constants.COMPACT_GROUP_PLACEMENT_POLICY`` (compared
    case-insensitively), or when ``use_managed_instance_group`` is truthy.
    Otherwise the resource policy named ``<cluster prefix>-placement-policy``
    is looked up in ``region``, created if the lookup finds nothing, and
    ``node_config['resourcePolicies']`` is set to a list holding its name.

    Returns:
        The same ``config`` object (its ``node_config`` may be mutated).
    """
    node_config = config.node_config
    project = config.provider_config['project_id']
    requested_policy = config.provider_config.get('placement_policy')
    uses_mig = config.provider_config.get('use_managed_instance_group',
                                          False)

    # Only the compact policy is handled here.
    if requested_policy is None:
        return config
    if requested_policy.lower() != constants.COMPACT_GROUP_PLACEMENT_POLICY:
        return config
    # If placement policy is specified together with managed instance group,
    # it will cause the following error:
    # Reason: [{'code': 'UNSUPPORTED_OPERATION',
    #           'message': 'Creating queued resource with
    #           resource policies is not supported.'}]
    if uses_mig:
        return config

    short_name = cluster_name[:constants.CLUSTER_PREFIX_LENGTH]
    policy_name = f'{short_name}-placement-policy'

    # Try to get the placement policy first, if not found, create it
    existing = _get_placement_policy(project, region, compute, policy_name)
    if not existing:
        logger.info(f'Creating placement policy {policy_name}'
                    f' for cluster {cluster_name}')
        policy_body = {
            'name': policy_name,
            'groupPlacementPolicy': {
                'collocation': constants.COLLOCATED_COLLOCATION,
            }
        }
        _create_placement_policy(project, region, compute, policy_body)

    node_config['resourcePolicies'] = [policy_name]
    return config
|
775
|
+
|
776
|
+
|
663
777
|
def _configure_subnet(region: str, cluster_name: str,
|
664
778
|
config: common.ProvisionConfig, compute):
|
665
779
|
"""Pick a reasonable subnet if not specified by the config."""
|
@@ -671,25 +785,54 @@ def _configure_subnet(region: str, cluster_name: str,
|
|
671
785
|
if 'networkInterfaces' in node_config or 'networkConfig' in node_config:
|
672
786
|
return config
|
673
787
|
|
674
|
-
|
675
|
-
|
676
|
-
compute)
|
677
|
-
|
678
|
-
default_interfaces = [{
|
679
|
-
'subnetwork': default_subnet['selfLink'],
|
680
|
-
'accessConfigs': [{
|
681
|
-
'name': 'External NAT',
|
682
|
-
'type': 'ONE_TO_ONE_NAT',
|
683
|
-
}]
|
684
|
-
}]
|
685
|
-
# Add gVNIC if specified in config
|
788
|
+
default_interfaces = []
|
789
|
+
enable_gpu_direct = config.provider_config.get('enable_gpu_direct', False)
|
686
790
|
enable_gvnic = config.provider_config.get('enable_gvnic', False)
|
687
|
-
if
|
688
|
-
|
791
|
+
if enable_gpu_direct:
|
792
|
+
if not enable_gvnic:
|
793
|
+
logger.warning(
|
794
|
+
'Enable GPU Direct requires gvnic to be enabled, enabling gvnic'
|
795
|
+
)
|
796
|
+
config.provider_config['enable_gvnic'] = True
|
797
|
+
enable_gvnic = True
|
798
|
+
if 'machineType' not in node_config or node_config[
|
799
|
+
'machineType'] not in constants.GPU_DIRECT_TCPX_INSTANCE_TYPES:
|
800
|
+
raise ValueError(
|
801
|
+
'Enable GPU Direct requires machineType to be one of '
|
802
|
+
f'{constants.GPU_DIRECT_TCPX_INSTANCE_TYPES}')
|
803
|
+
logger.info(f'Enable GPU Direct for cluster {cluster_name} '
|
804
|
+
f'with machineType {node_config["machineType"]}')
|
805
|
+
vpc_subnet_pairs = get_gpu_direct_usable_vpcs_and_subnets(
|
806
|
+
cluster_name, region, config, compute)
|
807
|
+
for _, subnet in vpc_subnet_pairs:
|
808
|
+
default_interfaces.append({
|
809
|
+
'subnetwork': subnet['selfLink'],
|
810
|
+
'accessConfigs': [{
|
811
|
+
'name': 'External NAT',
|
812
|
+
'type': 'ONE_TO_ONE_NAT',
|
813
|
+
}],
|
814
|
+
'nicType': 'gVNIC'
|
815
|
+
})
|
816
|
+
else:
|
817
|
+
# SkyPilot: make sure there's a usable VPC
|
818
|
+
_, default_subnet = get_usable_vpc_and_subnet(cluster_name, region,
|
819
|
+
config, compute)
|
820
|
+
|
821
|
+
default_interfaces = [{
|
822
|
+
'subnetwork': default_subnet['selfLink'],
|
823
|
+
'accessConfigs': [{
|
824
|
+
'name': 'External NAT',
|
825
|
+
'type': 'ONE_TO_ONE_NAT',
|
826
|
+
}]
|
827
|
+
}]
|
828
|
+
# Add gVNIC if specified in config
|
829
|
+
if enable_gvnic:
|
830
|
+
default_interfaces[0]['nicType'] = 'gVNIC'
|
689
831
|
enable_external_ips = _enable_external_ips(config)
|
690
832
|
if not enable_external_ips:
|
691
833
|
# Removing this key means the VM will not be assigned an external IP.
|
692
|
-
default_interfaces
|
834
|
+
for interface in default_interfaces:
|
835
|
+
interface.pop('accessConfigs')
|
693
836
|
|
694
837
|
# The not applicable key will be removed during node creation
|
695
838
|
|
@@ -840,3 +983,42 @@ def _add_iam_policy_binding(service_account, policy, crm, iam):
|
|
840
983
|
).execute())
|
841
984
|
|
842
985
|
return result
|
986
|
+
|
987
|
+
|
988
|
+
def _create_subnet(project_id: str, region: str, compute, vpc_name: str,
                   subnet_name: str, ip_cidr_range: str):
    """Insert a subnetwork into ``vpc_name`` and wait for the operation.

    Args:
        project_id: Project in which the subnetwork is created.
        region: Region of the subnetwork.
        compute: Compute API client exposing ``subnetworks()``.
        vpc_name: Name of the parent network.
        subnet_name: Name given to the new subnetwork.
        ip_cidr_range: CIDR range assigned to the subnetwork.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        insert operation.
    """
    network_path = f'projects/{project_id}/global/networks/{vpc_name}'
    subnet_spec = {
        'name': subnet_name,
        'ipCidrRange': ip_cidr_range,
        'network': network_path,
        'region': region,
    }
    insert_request = compute.subnetworks().insert(project=project_id,
                                                  region=region,
                                                  body=subnet_spec)
    pending = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region, pending,
                                             compute)
|
1002
|
+
|
1003
|
+
|
1004
|
+
def _create_placement_policy(project_id: str, region: str, compute,
                             placement_policy: dict):
    """Insert a regional resource policy and wait for the operation.

    Args:
        project_id: Project in which the policy is created.
        region: Region of the policy.
        compute: Compute API client exposing ``resourcePolicies()``.
        placement_policy: Request body sent to ``resourcePolicies().insert``.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        insert operation.
    """
    insert_request = compute.resourcePolicies().insert(project=project_id,
                                                       region=region,
                                                       body=placement_policy)
    pending = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region, pending,
                                             compute)
|
1011
|
+
|
1012
|
+
|
1013
|
+
def _get_placement_policy(project_id: str, region: str, compute, name: str):
    """Fetch a regional resource policy by name.

    Args:
        project_id: Project that owns the policy.
        region: Region of the policy.
        compute: Compute API client exposing ``resourcePolicies()``.
        name: Resource policy name to look up.

    Returns:
        The result of ``resourcePolicies().get(...).execute()``, or ``None``
        when the request fails with HTTP status 404.

    Raises:
        The HTTP error from the request, re-raised unchanged, for any
        status other than 404.
    """
    try:
        return compute.resourcePolicies().get(
            project=project_id,
            region=region,
            resourcePolicy=name,
        ).execute()
    except gcp.http_error_exception() as e:
        # Only "not found" is treated as a normal outcome.
        if e.resp.status != 404:
            raise
        return None
|
sky/provision/gcp/constants.py
CHANGED
@@ -41,6 +41,70 @@ HAS_TPU_PROVIDER_FIELD = '_has_tpus'
|
|
41
41
|
# with ServiceAccounts.
|
42
42
|
|
43
43
|
SKYPILOT_VPC_NAME = 'skypilot-vpc'
|
44
|
+
# Number of VPC networks (each with one subnet) set up per cluster when
# GPU Direct is enabled.
SKYPILOT_GPU_DIRECT_VPC_NUM = 5
# Leading two octets of the GPU Direct subnet ranges; subnet i is created
# with the range '<prefix>.<i>.0/24'.
SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX = '10.129'
# Machine types accepted when GPU Direct is enabled; any other machineType
# is rejected with a ValueError during subnet configuration.
GPU_DIRECT_TCPX_INSTANCE_TYPES = [
    'a3-edgegpu-8g',
    'a3-highgpu-8g',
]
# The prefix length of the cluster name.
# To make sure the VPC and subnet names are within the GCP limits.
CLUSTER_PREFIX_LENGTH = 10

# Value of the 'placement_policy' provider setting (compared in lower case)
# that triggers creation of a group placement policy.
COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
# 'collocation' value written into the groupPlacementPolicy request body.
COLLOCATED_COLLOCATION = 'COLLOCATED'
|
56
|
+
GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
|
57
|
+
set -e
|
58
|
+
set -x
|
59
|
+
# Install GPU Direct TCPX
|
60
|
+
cos-extensions install gpu -- --version=latest;
|
61
|
+
sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
|
62
|
+
sudo mount -o remount,exec /var/lib/nvidia;
|
63
|
+
docker ps -a | grep -q receive-datapath-manager || \
|
64
|
+
docker run \
|
65
|
+
--detach \
|
66
|
+
--pull=always \
|
67
|
+
--name receive-datapath-manager \
|
68
|
+
--privileged \
|
69
|
+
--cap-add=NET_ADMIN --network=host \
|
70
|
+
--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \
|
71
|
+
--device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 \
|
72
|
+
--device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 \
|
73
|
+
--device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 \
|
74
|
+
--device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 \
|
75
|
+
--device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl \
|
76
|
+
--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
|
77
|
+
--volume /run/tcpx:/run/tcpx \
|
78
|
+
--entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \
|
79
|
+
us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd \
|
80
|
+
--gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0";
|
81
|
+
sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT;
|
82
|
+
docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl;
|
83
|
+
sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
|
84
|
+
sudo mount -o remount,exec /var/lib/tcpx;
|
85
|
+
echo "GPU Direct TCPX installed"
|
86
|
+
"""
|
87
|
+
|
88
|
+
# Docker command-line options for GPU Direct TCPX: extra capability and user
# namespace settings, bind mounts for the TCPX socket directory and the
# NVIDIA/TCPX libraries, shared-memory and ulimit settings, the eight GPU
# device nodes plus the uvm/ctl devices, and the matching LD_LIBRARY_PATH.
# NOTE(review): presumably appended to the `docker run` of the user's
# container -- the consumer is not visible here, confirm against the caller.
GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
    '--cap-add=IPC_LOCK',
    '--userns=host',
    '--volume /run/tcpx:/run/tcpx',
    '--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64',
    '--volume /var/lib/tcpx/lib64:/usr/local/tcpx/lib64',
    '--volume /var/lib/nvidia/bin:/usr/local/nvidia/bin',
    '--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864',
    '--device /dev/nvidia0:/dev/nvidia0',
    '--device /dev/nvidia1:/dev/nvidia1',
    '--device /dev/nvidia2:/dev/nvidia2',
    '--device /dev/nvidia3:/dev/nvidia3',
    '--device /dev/nvidia4:/dev/nvidia4',
    '--device /dev/nvidia5:/dev/nvidia5',
    '--device /dev/nvidia6:/dev/nvidia6',
    '--device /dev/nvidia7:/dev/nvidia7',
    '--device /dev/nvidia-uvm:/dev/nvidia-uvm',
    '--device /dev/nvidiactl:/dev/nvidiactl',
    '--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
]
|
44
108
|
|
45
109
|
# Below parameters are from the default VPC on GCP.
|
46
110
|
# https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
|
sky/provision/gcp/instance.py
CHANGED
@@ -530,9 +530,11 @@ def terminate_instances(
|
|
530
530
|
use_mig = provider_config.get('use_managed_instance_group', False)
|
531
531
|
if use_mig:
|
532
532
|
# Deleting the MIG will also delete the instances.
|
533
|
-
|
534
|
-
|
535
|
-
|
533
|
+
mig_exists_and_deleted = (
|
534
|
+
instance_utils.GCPManagedInstanceGroup.delete_mig(
|
535
|
+
project_id, zone, cluster_name_on_cloud))
|
536
|
+
if mig_exists_and_deleted:
|
537
|
+
return
|
536
538
|
|
537
539
|
label_filters = {
|
538
540
|
provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
|
@@ -1125,12 +1125,14 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
|
|
1125
1125
|
if re.search(mig_utils.IT_RESOURCE_NOT_FOUND_PATTERN,
|
1126
1126
|
str(e)) is None:
|
1127
1127
|
raise
|
1128
|
-
logger.
|
1128
|
+
logger.debug(
|
1129
1129
|
f'Instance template {instance_template_name!r} does not exist. '
|
1130
1130
|
'Skip deletion.')
|
1131
1131
|
|
1132
1132
|
@classmethod
|
1133
|
-
def delete_mig(cls, project_id: str, zone: str, cluster_name: str) ->
|
1133
|
+
def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> bool:
|
1134
|
+
"""Returns whether the MIG is deleted successfully."""
|
1135
|
+
mig_exists_and_deleted = True
|
1134
1136
|
mig_name = mig_utils.get_managed_instance_group_name(cluster_name)
|
1135
1137
|
# Get all resize request of the MIG and cancel them.
|
1136
1138
|
mig_utils.cancel_all_resize_request_for_mig(project_id, zone, mig_name)
|
@@ -1144,8 +1146,9 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
|
|
1144
1146
|
if re.search(mig_utils.MIG_RESOURCE_NOT_FOUND_PATTERN,
|
1145
1147
|
str(e)) is None:
|
1146
1148
|
raise
|
1147
|
-
logger.
|
1148
|
-
|
1149
|
+
logger.debug(f'MIG {mig_name!r} does not exist. Skip '
|
1150
|
+
'deletion.')
|
1151
|
+
mig_exists_and_deleted = False
|
1149
1152
|
|
1150
1153
|
# In the autostop case, the following deletion of instance template
|
1151
1154
|
# will not be executed as the instance that runs the deletion will be
|
@@ -1156,6 +1159,7 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
|
|
1156
1159
|
cls._delete_instance_template(
|
1157
1160
|
project_id, zone,
|
1158
1161
|
mig_utils.get_instance_template_name(cluster_name))
|
1162
|
+
return mig_exists_and_deleted
|
1159
1163
|
|
1160
1164
|
@classmethod
|
1161
1165
|
def _add_labels_and_find_head(
|
sky/provision/nebius/instance.py
CHANGED
@@ -132,7 +132,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
132
132
|
region=region,
|
133
133
|
image_family=config.node_config['ImageId'],
|
134
134
|
disk_size=config.node_config['DiskSize'],
|
135
|
-
user_data=config.node_config['UserData']
|
135
|
+
user_data=config.node_config['UserData'],
|
136
|
+
associate_public_ip_address=(
|
137
|
+
not config.provider_config['use_internal_ips']))
|
136
138
|
except Exception as e: # pylint: disable=broad-except
|
137
139
|
logger.warning(f'run_instances error: {e}')
|
138
140
|
raise
|
sky/provision/nebius/utils.py
CHANGED
@@ -158,7 +158,7 @@ def start(instance_id: str) -> None:
|
|
158
158
|
|
159
159
|
def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
160
160
|
preset: str, region: str, image_family: str, disk_size: int,
|
161
|
-
user_data: str) -> str:
|
161
|
+
user_data: str, associate_public_ip_address: bool) -> str:
|
162
162
|
# Each node must have a unique name to avoid conflicts between
|
163
163
|
# multiple worker VMs. To ensure uniqueness,a UUID is appended
|
164
164
|
# to the node name.
|
@@ -242,7 +242,9 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
|
242
242
|
subnet_id=sub_net.items[0].metadata.id,
|
243
243
|
ip_address=nebius.compute().IPAddress(),
|
244
244
|
name='network-interface-0',
|
245
|
-
public_ip_address=nebius.compute().PublicIPAddress()
|
245
|
+
public_ip_address=nebius.compute().PublicIPAddress()
|
246
|
+
if associate_public_ip_address else None,
|
247
|
+
)
|
246
248
|
]))).wait()
|
247
249
|
instance_id = ''
|
248
250
|
retry_count = 0
|