skypilot-nightly 1.0.0.dev20250510__py3-none-any.whl → 1.0.0.dev20250513__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +3 -0
- sky/backends/cloud_vm_ray_backend.py +7 -0
- sky/cli.py +109 -109
- sky/client/cli.py +109 -109
- sky/clouds/gcp.py +35 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{C0fkLhvxyqkymoV7IeInQ → 2dkponv64SfFShA8Rnw0D}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/global_user_state.py +2 -0
- sky/provision/docker_utils.py +4 -1
- sky/provision/gcp/config.py +197 -15
- sky/provision/gcp/constants.py +64 -0
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +4 -2
- sky/server/requests/executor.py +114 -22
- sky/server/requests/requests.py +15 -0
- sky/server/server.py +12 -7
- sky/server/uvicorn.py +12 -2
- sky/sky_logging.py +40 -2
- sky/skylet/constants.py +3 -0
- sky/skylet/log_lib.py +51 -11
- sky/templates/gcp-ray.yml.j2 +11 -0
- sky/templates/nebius-ray.yml.j2 +4 -0
- sky/templates/websocket_proxy.py +29 -9
- sky/utils/command_runner.py +3 -0
- sky/utils/context.py +264 -0
- sky/utils/context_utils.py +172 -0
- sky/utils/rich_utils.py +81 -37
- sky/utils/schemas.py +9 -1
- sky/utils/subprocess_utils.py +8 -2
- {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/RECORD +44 -42
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- /sky/dashboard/out/_next/static/{C0fkLhvxyqkymoV7IeInQ → 2dkponv64SfFShA8Rnw0D}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/top_level.txt +0 -0
sky/dashboard/out/jobs.html
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><link rel="preload" href="/dashboard/skypilot.svg" as="image" fetchpriority="high"/><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-0f8017370869e269.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-f49500b82ad5392d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js" defer=""></script><script src="/dashboard/_next/static/C0fkLhvxyqkymoV7IeInQ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/C0fkLhvxyqkymoV7IeInQ/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div class="min-h-screen bg-gray-50"><div class="fixed top-0 left-0 right-0 z-50 shadow-sm"><div class="fixed top-0 left-0 right-0 bg-white z-30 h-14 px-4 border-b border-gray-200 shadow-sm"><div class="flex items-center h-full"><div class="flex items-center space-x-4 mr-6"><a class="flex items-center px-1 pt-1 h-full" 
href="/dashboard"><div class="h-20 w-20 flex items-center justify-center"><img alt="SkyPilot Logo" fetchpriority="high" width="80" height="80" decoding="async" data-nimg="1" class="w-full h-full object-contain" style="color:transparent" src="/dashboard/skypilot.svg"/></div></a></div><div class="flex items-center space-x-2 md:space-x-6 mr-6"><a class="inline-flex items-center border-b-2 border-transparent hover:text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/clusters"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="20" height="8" x="2" y="2" rx="2" ry="2"></rect><rect width="20" height="8" x="2" y="14" rx="2" ry="2"></rect><line x1="6" x2="6.01" y1="6" y2="6"></line><line x1="6" x2="6.01" y1="18" y2="18"></line></svg><span>Clusters</span></a><a class="inline-flex items-center border-b-2 border-transparent text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/jobs"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M16 20V4a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"></path><rect width="20" height="14" x="2" y="6" rx="2"></rect></svg><span>Jobs</span></a><div class="inline-flex items-center px-1 pt-1 text-gray-400"><svg class="w-4 h-4" viewBox="0 0 423.683 423.683" width="24" height="24" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" fill="currentColor" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><g id="SVGRepo_bgCarrier" stroke-width="0"></g><g id="SVGRepo_tracerCarrier" stroke-linecap="round" stroke-linejoin="round"></g><g id="SVGRepo_iconCarrier"><g><path d="M54.376,287.577h310.459c26.48,0,48.02-13.979,48.02-40.453c0-17.916-10.001-34.07-25.559-42.292 
c-19.021-72.951-86.061-125.196-162.002-125.223v-3.431h-3.854V61.814h3.854v-9.569h-31.38v9.569h3.854v14.363h-3.854v3.431 c-75.941,0.026-142.97,52.272-161.988,125.217c-15.56,8.216-25.573,24.376-25.573,42.291 C6.36,273.597,27.896,287.577,54.376,287.577z M47.676,227.145l7.214-2.424l1.617-7.447 c13.884-64.232,71.707-110.862,137.467-110.862h31.274c65.763,0,123.582,46.63,137.473,110.862l1.607,7.447l7.223,2.424 c8.678,2.92,14.506,10.946,14.506,19.979c0,11.703-9.517,13.647-21.221,13.647H54.376c-11.7,0-21.22-1.944-21.22-13.647 C33.162,238.091,38.984,230.065,47.676,227.145z M423.683,334.602v36.836H0v-36.836h25.348v-18.418h372.99v18.418H423.683z"></path></g></g></svg><span class="ml-2">Services</span><span class="text-xs ml-2 px-1.5 py-0.5 bg-gray-100 text-gray-500 rounded">Soon</span></div></div><div class="flex items-center space-x-1 ml-auto"><a href="https://skypilot.readthedocs.io/en/latest/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center px-2 py-1 text-gray-600 hover:text-blue-600 transition-colors duration-150 cursor-pointer" title="Docs"><span class="mr-1">Docs</span><svg class="w-3.5 h-3.5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"></path><polyline points="15 3 21 3 21 9"></polyline><line x1="10" y1="14" x2="21" y2="3"></line></svg></a><div class="border-l border-gray-200 h-6 mx-1"></div><a href="https://github.com/skypilot-org/skypilot" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="GitHub"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 
11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"></path></svg></a><a href="https://slack.skypilot.co/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Slack"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path transform="scale(0.85) translate(1.8, 1.8)" d="M5.042 15.165a2.528 2.528 0 0 1-2.52 2.523A2.528 2.528 0 0 1 0 15.165a2.527 2.527 0 0 1 2.522-2.52h2.52v2.52zM6.313 15.165a2.527 2.527 0 0 1 2.521-2.52 2.527 2.527 0 0 1 2.521 2.52v6.313A2.528 2.528 0 0 1 8.834 24a2.528 2.528 0 0 1-2.521-2.522v-6.313zM8.834 5.042a2.528 2.528 0 0 1-2.521-2.52A2.528 2.528 0 0 1 8.834 0a2.528 2.528 0 0 1 2.521 2.522v2.52H8.834zM8.834 6.313a2.528 2.528 0 0 1 2.521 2.521 2.528 2.528 0 0 1-2.521 2.521H2.522A2.528 2.528 0 0 1 0 8.834a2.528 2.528 0 0 1 2.522-2.521h6.312zM18.956 8.834a2.528 2.528 0 0 1 2.522-2.521A2.528 2.528 0 0 1 24 8.834a2.528 2.528 0 0 1-2.522 2.521h-2.522V8.834zM17.688 8.834a2.528 2.528 0 0 1-2.523 2.521 2.527 2.527 0 0 1-2.52-2.521V2.522A2.527 2.527 0 0 1 15.165 0a2.528 2.528 0 0 1 2.523 2.522v6.312zM15.165 18.956a2.528 2.528 0 0 1 2.523 2.522A2.528 2.528 0 0 1 15.165 24a2.527 2.527 0 0 1-2.52-2.522v-2.522h2.52zM15.165 17.688a2.527 2.527 0 0 
1-2.52-2.523 2.526 2.526 0 0 1 2.52-2.52h6.313A2.527 2.527 0 0 1 24 15.165a2.528 2.528 0 0 1-2.522 2.523h-6.313z"></path></svg></a><a href="https://github.com/skypilot-org/skypilot/issues/new" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Leave Feedback"><svg class="w-5 h-5" stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><g><path fill="none" d="M0 0h24v24H0z"></path><path d="M6.455 19L2 22.5V4a1 1 0 0 1 1-1h18a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H6.455zM4 18.385L5.763 17H20V5H4v13.385zM11 13h2v2h-2v-2zm0-6h2v5h-2V7z"></path></g></svg></a></div></div></div></div><div class="transition-all duration-200 ease-in-out min-h-screen" style="padding-top:56px"><main class="p-6"><div class="flex items-center justify-between mb-4 h-5"><div class="text-base"><a class="text-sky-blue hover:underline leading-none" href="/dashboard/jobs">Managed Jobs</a></div><div class="flex items-center space-x-2"><button class="inline-flex items-center justify-center whitespace-nowrap text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 hover:bg-accent h-9 rounded-md px-3 text-sky-blue hover:text-sky-blue-bright" title="Refresh"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-rotate-cw h-4 w-4 mr-1.5"><path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8"></path><path d="M21 3v5h-5"></path></svg><span>Refresh</span></button></div></div><div class="relative"><div class="flex flex-col space-y-1 mb-1"><div class="flex flex-wrap items-center text-sm mb-1"><span class="mr-2 text-sm 
font-medium">Statuses:</span><div class="flex flex-wrap gap-2 items-center"></div></div></div><div class="rounded-lg border bg-card text-card-foreground shadow-sm"><div class="relative w-full overflow-auto"><table class="w-full caption-bottom text-base"><thead class="[&_tr]:border-b"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">ID</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Name</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">User</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Submitted</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Duration</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Status</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Resources</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Cluster</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Region</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Recoveries</th><th class="h-12 px-4 text-left align-middle font-medium 
text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0">Details</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0">Logs</th></tr></thead><tbody class="[&_tr:last-child]:border-0"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><td class="p-4 align-middle [&:has([role=checkbox])]:pr-0 text-center py-6" colSpan="12"><div class="flex flex-col items-center space-y-4"><p class="text-gray-500">No active jobs</p></div></td></tr></tbody></table></div></div></div></main></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"C0fkLhvxyqkymoV7IeInQ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><link rel="preload" href="/dashboard/skypilot.svg" as="image" fetchpriority="high"/><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-0ca6f2c1ba667c3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-f49500b82ad5392d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js" defer=""></script><script src="/dashboard/_next/static/2dkponv64SfFShA8Rnw0D/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2dkponv64SfFShA8Rnw0D/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div class="min-h-screen bg-gray-50"><div class="fixed top-0 left-0 right-0 z-50 shadow-sm"><div class="fixed top-0 left-0 right-0 bg-white z-30 h-14 px-4 border-b border-gray-200 shadow-sm"><div class="flex items-center h-full"><div class="flex items-center space-x-4 mr-6"><a class="flex items-center px-1 pt-1 h-full" 
href="/dashboard"><div class="h-20 w-20 flex items-center justify-center"><img alt="SkyPilot Logo" fetchpriority="high" width="80" height="80" decoding="async" data-nimg="1" class="w-full h-full object-contain" style="color:transparent" src="/dashboard/skypilot.svg"/></div></a></div><div class="flex items-center space-x-2 md:space-x-6 mr-6"><a class="inline-flex items-center border-b-2 border-transparent hover:text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/clusters"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="20" height="8" x="2" y="2" rx="2" ry="2"></rect><rect width="20" height="8" x="2" y="14" rx="2" ry="2"></rect><line x1="6" x2="6.01" y1="6" y2="6"></line><line x1="6" x2="6.01" y1="18" y2="18"></line></svg><span>Clusters</span></a><a class="inline-flex items-center border-b-2 border-transparent text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/jobs"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M16 20V4a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"></path><rect width="20" height="14" x="2" y="6" rx="2"></rect></svg><span>Jobs</span></a><div class="inline-flex items-center px-1 pt-1 text-gray-400"><svg class="w-4 h-4" viewBox="0 0 423.683 423.683" width="24" height="24" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" fill="currentColor" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><g id="SVGRepo_bgCarrier" stroke-width="0"></g><g id="SVGRepo_tracerCarrier" stroke-linecap="round" stroke-linejoin="round"></g><g id="SVGRepo_iconCarrier"><g><path d="M54.376,287.577h310.459c26.48,0,48.02-13.979,48.02-40.453c0-17.916-10.001-34.07-25.559-42.292 
c-19.021-72.951-86.061-125.196-162.002-125.223v-3.431h-3.854V61.814h3.854v-9.569h-31.38v9.569h3.854v14.363h-3.854v3.431 c-75.941,0.026-142.97,52.272-161.988,125.217c-15.56,8.216-25.573,24.376-25.573,42.291 C6.36,273.597,27.896,287.577,54.376,287.577z M47.676,227.145l7.214-2.424l1.617-7.447 c13.884-64.232,71.707-110.862,137.467-110.862h31.274c65.763,0,123.582,46.63,137.473,110.862l1.607,7.447l7.223,2.424 c8.678,2.92,14.506,10.946,14.506,19.979c0,11.703-9.517,13.647-21.221,13.647H54.376c-11.7,0-21.22-1.944-21.22-13.647 C33.162,238.091,38.984,230.065,47.676,227.145z M423.683,334.602v36.836H0v-36.836h25.348v-18.418h372.99v18.418H423.683z"></path></g></g></svg><span class="ml-2">Services</span><span class="text-xs ml-2 px-1.5 py-0.5 bg-gray-100 text-gray-500 rounded">Soon</span></div></div><div class="flex items-center space-x-1 ml-auto"><a href="https://skypilot.readthedocs.io/en/latest/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center px-2 py-1 text-gray-600 hover:text-blue-600 transition-colors duration-150 cursor-pointer" title="Docs"><span class="mr-1">Docs</span><svg class="w-3.5 h-3.5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"></path><polyline points="15 3 21 3 21 9"></polyline><line x1="10" y1="14" x2="21" y2="3"></line></svg></a><div class="border-l border-gray-200 h-6 mx-1"></div><a href="https://github.com/skypilot-org/skypilot" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="GitHub"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 
11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"></path></svg></a><a href="https://slack.skypilot.co/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Slack"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path transform="scale(0.85) translate(1.8, 1.8)" d="M5.042 15.165a2.528 2.528 0 0 1-2.52 2.523A2.528 2.528 0 0 1 0 15.165a2.527 2.527 0 0 1 2.522-2.52h2.52v2.52zM6.313 15.165a2.527 2.527 0 0 1 2.521-2.52 2.527 2.527 0 0 1 2.521 2.52v6.313A2.528 2.528 0 0 1 8.834 24a2.528 2.528 0 0 1-2.521-2.522v-6.313zM8.834 5.042a2.528 2.528 0 0 1-2.521-2.52A2.528 2.528 0 0 1 8.834 0a2.528 2.528 0 0 1 2.521 2.522v2.52H8.834zM8.834 6.313a2.528 2.528 0 0 1 2.521 2.521 2.528 2.528 0 0 1-2.521 2.521H2.522A2.528 2.528 0 0 1 0 8.834a2.528 2.528 0 0 1 2.522-2.521h6.312zM18.956 8.834a2.528 2.528 0 0 1 2.522-2.521A2.528 2.528 0 0 1 24 8.834a2.528 2.528 0 0 1-2.522 2.521h-2.522V8.834zM17.688 8.834a2.528 2.528 0 0 1-2.523 2.521 2.527 2.527 0 0 1-2.52-2.521V2.522A2.527 2.527 0 0 1 15.165 0a2.528 2.528 0 0 1 2.523 2.522v6.312zM15.165 18.956a2.528 2.528 0 0 1 2.523 2.522A2.528 2.528 0 0 1 15.165 24a2.527 2.527 0 0 1-2.52-2.522v-2.522h2.52zM15.165 17.688a2.527 2.527 0 0 
1-2.52-2.523 2.526 2.526 0 0 1 2.52-2.52h6.313A2.527 2.527 0 0 1 24 15.165a2.528 2.528 0 0 1-2.522 2.523h-6.313z"></path></svg></a><a href="https://github.com/skypilot-org/skypilot/issues/new" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Leave Feedback"><svg class="w-5 h-5" stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><g><path fill="none" d="M0 0h24v24H0z"></path><path d="M6.455 19L2 22.5V4a1 1 0 0 1 1-1h18a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H6.455zM4 18.385L5.763 17H20V5H4v13.385zM11 13h2v2h-2v-2zm0-6h2v5h-2V7z"></path></g></svg></a></div></div></div></div><div class="transition-all duration-200 ease-in-out min-h-screen" style="padding-top:56px"><main class="p-6"><div class="flex items-center justify-between mb-4 h-5"><div class="text-base"><a class="text-sky-blue hover:underline leading-none" href="/dashboard/jobs">Managed Jobs</a></div><div class="flex items-center space-x-2"><button class="inline-flex items-center justify-center whitespace-nowrap text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 hover:bg-accent h-9 rounded-md px-3 text-sky-blue hover:text-sky-blue-bright" title="Refresh"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-rotate-cw h-4 w-4 mr-1.5"><path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8"></path><path d="M21 3v5h-5"></path></svg><span>Refresh</span></button></div></div><div class="relative"><div class="flex flex-col space-y-1 mb-1"><div class="flex flex-wrap items-center text-sm mb-1"><span class="mr-2 text-sm 
font-medium">Statuses:</span><div class="flex flex-wrap gap-2 items-center"></div></div></div><div class="rounded-lg border bg-card text-card-foreground shadow-sm"><div class="relative w-full overflow-auto"><table class="w-full caption-bottom text-base"><thead class="[&_tr]:border-b"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">ID</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Name</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">User</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Submitted</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Duration</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Status</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Resources</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Cluster</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Region</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Recoveries</th><th class="h-12 px-4 text-left align-middle font-medium 
text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0">Details</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&:has([role=checkbox])]:pr-0">Logs</th></tr></thead><tbody class="[&_tr:last-child]:border-0"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><td class="p-4 align-middle [&:has([role=checkbox])]:pr-0 text-center py-6" colSpan="12"><div class="flex flex-col items-center space-y-4"><p class="text-gray-500">No active jobs</p></div></td></tr></tbody></table></div></div></div></main></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"2dkponv64SfFShA8Rnw0D","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
|
sky/global_user_state.py
CHANGED
@@ -19,6 +19,7 @@ import uuid
|
|
19
19
|
from sky import models
|
20
20
|
from sky import sky_logging
|
21
21
|
from sky.utils import common_utils
|
22
|
+
from sky.utils import context_utils
|
22
23
|
from sky.utils import db_utils
|
23
24
|
from sky.utils import registry
|
24
25
|
from sky.utils import status_lib
|
@@ -671,6 +672,7 @@ def _load_storage_mounts_metadata(
|
|
671
672
|
return pickle.loads(record_storage_mounts_metadata)
|
672
673
|
|
673
674
|
|
675
|
+
@context_utils.cancellation_guard
|
674
676
|
def get_cluster_from_name(
|
675
677
|
cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
|
676
678
|
rows = _DB.cursor.execute(
|
sky/provision/docker_utils.py
CHANGED
@@ -343,9 +343,12 @@ class DockerInitializer:
|
|
343
343
|
# `mesg: ttyname failed: inappropriate ioctl for device`.
|
344
344
|
# see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
|
345
345
|
port = constants.DEFAULT_DOCKER_PORT
|
346
|
+
# In case the port is already configured in the sshd_config file
|
347
|
+
# in some images, we delete it first and then append the new one.
|
346
348
|
# pylint: disable=anomalous-backslash-in-string
|
347
349
|
self._run(
|
348
|
-
|
350
|
+
'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
|
351
|
+
f'sudo echo "Port {port}" >> /etc/ssh/sshd_config;'
|
349
352
|
'mkdir -p ~/.ssh;'
|
350
353
|
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
|
351
354
|
'sudo service ssh start;'
|
sky/provision/gcp/config.py
CHANGED
@@ -75,6 +75,30 @@ def wait_for_compute_global_operation(project_name, operation, compute):
|
|
75
75
|
return result
|
76
76
|
|
77
77
|
|
78
|
+
def wait_for_compute_region_operation(project_name, region, operation, compute):
|
79
|
+
"""Poll for region compute operation until finished."""
|
80
|
+
logger.info('wait_for_compute_region_operation: '
|
81
|
+
'Waiting for operation {} to finish...'.format(
|
82
|
+
operation['name']))
|
83
|
+
|
84
|
+
for _ in range(constants.MAX_POLLS):
|
85
|
+
result = (compute.regionOperations().get(
|
86
|
+
project=project_name,
|
87
|
+
region=region,
|
88
|
+
operation=operation['name'],
|
89
|
+
).execute())
|
90
|
+
if 'error' in result:
|
91
|
+
raise Exception(result['error'])
|
92
|
+
|
93
|
+
if result['status'] == 'DONE':
|
94
|
+
logger.info('wait_for_compute_region_operation: Operation done.')
|
95
|
+
break
|
96
|
+
|
97
|
+
time.sleep(constants.POLL_INTERVAL)
|
98
|
+
|
99
|
+
return result
|
100
|
+
|
101
|
+
|
78
102
|
def _create_crm(gcp_credentials=None):
|
79
103
|
return gcp.build('cloudresourcemanager',
|
80
104
|
'v1',
|
@@ -168,6 +192,7 @@ def bootstrap_instances(
|
|
168
192
|
iam_role = _configure_iam_role(config, crm, iam)
|
169
193
|
config.node_config.update(iam_role)
|
170
194
|
config = _configure_subnet(region, cluster_name, config, compute)
|
195
|
+
config = _configure_placement_policy(region, cluster_name, config, compute)
|
171
196
|
|
172
197
|
return config
|
173
198
|
|
@@ -660,6 +685,95 @@ def get_usable_vpc_and_subnet(
|
|
660
685
|
return usable_vpc_name, usable_subnet
|
661
686
|
|
662
687
|
|
688
|
+
def get_gpu_direct_usable_vpcs_and_subnets(
    cluster_name: str,
    region: str,
    config: common.ProvisionConfig,
    compute,
) -> List[Tuple[str, 'google.cloud.compute_v1.types.compute.Subnetwork']]:
    """Return a list of usable VPCs and subnets for GPU Direct.

    For each of the ``constants.SKYPILOT_GPU_DIRECT_VPC_NUM`` networks, the
    VPC is looked up by name and created when missing, a subnet in ``region``
    is looked up and created when missing, and the firewall rule template is
    applied to the VPC.

    Args:
        cluster_name: Cluster name; only its first
            ``constants.CLUSTER_PREFIX_LENGTH`` characters go into the names.
        region: Region in which subnets are listed and created.
        config: Provision config; ``provider_config['project_id']`` is read.
        compute: Compute API client forwarded to the helper functions.

    Returns:
        One ``(vpc_name, subnet)`` pair per network in index order, where
        ``subnet`` is the first entry returned by ``_list_subnets``.
    """
    project_id = config.provider_config['project_id']
    name_stem = (f'{constants.SKYPILOT}-'
                 f'{cluster_name[:constants.CLUSTER_PREFIX_LENGTH]}')

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    num_vpcs = constants.SKYPILOT_GPU_DIRECT_VPC_NUM
    cidr_prefix = constants.SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX

    pairs = []
    for index in range(num_vpcs):
        # Index 0 is named as the management network, the rest as numbered
        # data networks.
        role = 'mgmt-net' if index == 0 else f'data-net-{index}'
        vpc_name = f'{name_stem}-{role}'
        subnet_name = f'{vpc_name}-sub'
        subnet_cidr_range = f'{cidr_prefix}.{index}.0/24'

        # Create the VPC only when no network with this name exists yet.
        existing_vpcs = _list_vpcnets(project_id,
                                      compute,
                                      filter=f'name={vpc_name}')
        if not existing_vpcs:
            body = constants.VPC_TEMPLATE.copy()
            body.update({
                'mtu': 8244,
                'autoCreateSubnetworks': False,
                'name': vpc_name,
                'selfLink': body['selfLink'].format(PROJ_ID=project_id,
                                                    VPC_NAME=vpc_name),
            })
            _create_vpcnet(project_id, compute, body)

        # Create the subnet only when the VPC has none in this region, then
        # list again to pick up the created resource.
        subnets = _list_subnets(project_id, region, compute, network=vpc_name)
        if not subnets:
            _create_subnet(project_id, region, compute, vpc_name, subnet_name,
                           subnet_cidr_range)
            subnets = _list_subnets(project_id,
                                    region,
                                    compute,
                                    network=vpc_name)

        # Firewall rules are applied on every call, whether or not the VPC
        # was just created.
        _create_rules(project_id, compute, constants.FIREWALL_RULES_TEMPLATE,
                      vpc_name)
        pairs.append((vpc_name, subnets[0]))
    return pairs
|
735
|
+
|
736
|
+
|
737
|
+
def _configure_placement_policy(region: str, cluster_name: str,
                                config: common.ProvisionConfig, compute):
    """Configure placement group for GPU Direct.

    The node config is only touched when ``placement_policy`` in the provider
    config equals ``constants.COMPACT_GROUP_PLACEMENT_POLICY``
    (case-insensitively) and ``use_managed_instance_group`` is falsy.

    Args:
        region: Region in which the resource policy is looked up and created.
        cluster_name: Cluster name; its first
            ``constants.CLUSTER_PREFIX_LENGTH`` characters prefix the policy
            name.
        config: Provision config whose ``node_config`` may be updated in place.
        compute: Compute API client forwarded to the helper functions.

    Returns:
        The ``config`` object that was passed in.
    """
    node_config = config.node_config
    project_id = config.provider_config['project_id']
    requested_policy = config.provider_config.get('placement_policy', None)
    uses_mig = config.provider_config.get('use_managed_instance_group', False)

    # Skip the placement policy creation if the policy is not compact or a
    # managed instance group is specified. Specifying a placement policy
    # together with a managed instance group causes the following error:
    # Reason: [{'code': 'UNSUPPORTED_OPERATION',
    #           'message': 'Creating queued resource with
    #                       resource policies is not supported.'}]
    wants_compact = (requested_policy is not None and
                     requested_policy.lower()
                     == constants.COMPACT_GROUP_PLACEMENT_POLICY)
    if not wants_compact or uses_mig:
        return config

    policy_name = (f'{cluster_name[:constants.CLUSTER_PREFIX_LENGTH]}'
                   '-placement-policy')

    # Try to get the placement policy first, if not found, create it
    existing_policy = _get_placement_policy(project_id, region, compute,
                                            policy_name)
    if not existing_policy:
        logger.info(f'Creating placement policy {policy_name}'
                    f' for cluster {cluster_name}')
        _create_placement_policy(
            project_id, region, compute, {
                'name': policy_name,
                'groupPlacementPolicy': {
                    'collocation': constants.COLLOCATED_COLLOCATION,
                }
            })

    node_config['resourcePolicies'] = [policy_name]
    return config
|
775
|
+
|
776
|
+
|
663
777
|
def _configure_subnet(region: str, cluster_name: str,
|
664
778
|
config: common.ProvisionConfig, compute):
|
665
779
|
"""Pick a reasonable subnet if not specified by the config."""
|
@@ -671,25 +785,54 @@ def _configure_subnet(region: str, cluster_name: str,
|
|
671
785
|
if 'networkInterfaces' in node_config or 'networkConfig' in node_config:
|
672
786
|
return config
|
673
787
|
|
674
|
-
|
675
|
-
|
676
|
-
compute)
|
677
|
-
|
678
|
-
default_interfaces = [{
|
679
|
-
'subnetwork': default_subnet['selfLink'],
|
680
|
-
'accessConfigs': [{
|
681
|
-
'name': 'External NAT',
|
682
|
-
'type': 'ONE_TO_ONE_NAT',
|
683
|
-
}]
|
684
|
-
}]
|
685
|
-
# Add gVNIC if specified in config
|
788
|
+
default_interfaces = []
|
789
|
+
enable_gpu_direct = config.provider_config.get('enable_gpu_direct', False)
|
686
790
|
enable_gvnic = config.provider_config.get('enable_gvnic', False)
|
687
|
-
if
|
688
|
-
|
791
|
+
if enable_gpu_direct:
|
792
|
+
if not enable_gvnic:
|
793
|
+
logger.warning(
|
794
|
+
'Enable GPU Direct requires gvnic to be enabled, enabling gvnic'
|
795
|
+
)
|
796
|
+
config.provider_config['enable_gvnic'] = True
|
797
|
+
enable_gvnic = True
|
798
|
+
if 'machineType' not in node_config or node_config[
|
799
|
+
'machineType'] not in constants.GPU_DIRECT_TCPX_INSTANCE_TYPES:
|
800
|
+
raise ValueError(
|
801
|
+
'Enable GPU Direct requires machineType to be one of '
|
802
|
+
f'{constants.GPU_DIRECT_TCPX_INSTANCE_TYPES}')
|
803
|
+
logger.info(f'Enable GPU Direct for cluster {cluster_name} '
|
804
|
+
f'with machineType {node_config["machineType"]}')
|
805
|
+
vpc_subnet_pairs = get_gpu_direct_usable_vpcs_and_subnets(
|
806
|
+
cluster_name, region, config, compute)
|
807
|
+
for _, subnet in vpc_subnet_pairs:
|
808
|
+
default_interfaces.append({
|
809
|
+
'subnetwork': subnet['selfLink'],
|
810
|
+
'accessConfigs': [{
|
811
|
+
'name': 'External NAT',
|
812
|
+
'type': 'ONE_TO_ONE_NAT',
|
813
|
+
}],
|
814
|
+
'nicType': 'gVNIC'
|
815
|
+
})
|
816
|
+
else:
|
817
|
+
# SkyPilot: make sure there's a usable VPC
|
818
|
+
_, default_subnet = get_usable_vpc_and_subnet(cluster_name, region,
|
819
|
+
config, compute)
|
820
|
+
|
821
|
+
default_interfaces = [{
|
822
|
+
'subnetwork': default_subnet['selfLink'],
|
823
|
+
'accessConfigs': [{
|
824
|
+
'name': 'External NAT',
|
825
|
+
'type': 'ONE_TO_ONE_NAT',
|
826
|
+
}]
|
827
|
+
}]
|
828
|
+
# Add gVNIC if specified in config
|
829
|
+
if enable_gvnic:
|
830
|
+
default_interfaces[0]['nicType'] = 'gVNIC'
|
689
831
|
enable_external_ips = _enable_external_ips(config)
|
690
832
|
if not enable_external_ips:
|
691
833
|
# Removing this key means the VM will not be assigned an external IP.
|
692
|
-
default_interfaces
|
834
|
+
for interface in default_interfaces:
|
835
|
+
interface.pop('accessConfigs')
|
693
836
|
|
694
837
|
# The not applicable key will be removed during node creation
|
695
838
|
|
@@ -840,3 +983,42 @@ def _add_iam_policy_binding(service_account, policy, crm, iam):
|
|
840
983
|
).execute())
|
841
984
|
|
842
985
|
return result
|
986
|
+
|
987
|
+
|
988
|
+
def _create_subnet(project_id: str, region: str, compute, vpc_name: str,
                   subnet_name: str, ip_cidr_range: str):
    """Insert a subnet into a VPC and wait for the region operation.

    Args:
        project_id: Project that owns the VPC and the new subnet.
        region: Region in which the subnet is inserted.
        compute: Compute API client exposing ``subnetworks()``.
        vpc_name: Name of the network the subnet is attached to.
        subnet_name: Name given to the new subnet.
        ip_cidr_range: Value sent as the subnet's ``ipCidrRange``.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the insert
        operation.
    """
    network_path = f'projects/{project_id}/global/networks/{vpc_name}'
    insert_request = compute.subnetworks().insert(
        project=project_id,
        region=region,
        body={
            'name': subnet_name,
            'ipCidrRange': ip_cidr_range,
            'network': network_path,
            'region': region,
        })
    pending_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
|
1002
|
+
|
1003
|
+
|
1004
|
+
def _create_placement_policy(project_id: str, region: str, compute,
                             placement_policy: dict):
    """Insert a resource policy and wait for the region operation.

    Args:
        project_id: Project in which the policy is inserted.
        region: Region in which the policy is inserted.
        compute: Compute API client exposing ``resourcePolicies()``.
        placement_policy: Request body sent to ``resourcePolicies().insert``.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the insert
        operation.
    """
    insert_request = compute.resourcePolicies().insert(project=project_id,
                                                       region=region,
                                                       body=placement_policy)
    pending_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
|
1011
|
+
|
1012
|
+
|
1013
|
+
def _get_placement_policy(project_id: str, region: str, compute, name: str):
    """Fetch a resource policy by name, or ``None`` when it does not exist.

    Args:
        project_id: Project that is queried.
        region: Region that is queried.
        compute: Compute API client exposing ``resourcePolicies()``.
        name: Value passed as ``resourcePolicy`` to the ``get`` call.

    Returns:
        The response of ``resourcePolicies().get(...).execute()``, or ``None``
        if the call fails with an HTTP 404 status.

    Raises:
        The exception type returned by ``gcp.http_error_exception()`` for any
        status other than 404.
    """
    lookup = compute.resourcePolicies().get(
        project=project_id,
        region=region,
        resourcePolicy=name,
    )
    try:
        return lookup.execute()
    except gcp.http_error_exception() as e:
        # Only a 404 is treated as "no such policy"; everything else is
        # re-raised unchanged.
        if e.resp.status != 404:
            raise
        return None
|
sky/provision/gcp/constants.py
CHANGED
@@ -41,6 +41,70 @@ HAS_TPU_PROVIDER_FIELD = '_has_tpus'
|
|
41
41
|
# with ServiceAccounts.
|
42
42
|
|
43
43
|
SKYPILOT_VPC_NAME = 'skypilot-vpc'
|
44
|
+
SKYPILOT_GPU_DIRECT_VPC_NUM = 5
|
45
|
+
SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX = '10.129'
|
46
|
+
GPU_DIRECT_TCPX_INSTANCE_TYPES = [
|
47
|
+
'a3-edgegpu-8g',
|
48
|
+
'a3-highgpu-8g',
|
49
|
+
]
|
50
|
+
# The prefix length of the cluster name.
|
51
|
+
# To make sure the VPC and subnet names are within the GCP limits.
|
52
|
+
CLUSTER_PREFIX_LENGTH = 10
|
53
|
+
|
54
|
+
COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
|
55
|
+
COLLOCATED_COLLOCATION = 'COLLOCATED'
|
56
|
+
GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
|
57
|
+
set -e
|
58
|
+
set -x
|
59
|
+
# Install GPU Direct TCPX
|
60
|
+
cos-extensions install gpu -- --version=latest;
|
61
|
+
sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
|
62
|
+
sudo mount -o remount,exec /var/lib/nvidia;
|
63
|
+
docker ps -a | grep -q receive-datapath-manager || \
|
64
|
+
docker run \
|
65
|
+
--detach \
|
66
|
+
--pull=always \
|
67
|
+
--name receive-datapath-manager \
|
68
|
+
--privileged \
|
69
|
+
--cap-add=NET_ADMIN --network=host \
|
70
|
+
--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \
|
71
|
+
--device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 \
|
72
|
+
--device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 \
|
73
|
+
--device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 \
|
74
|
+
--device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 \
|
75
|
+
--device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl \
|
76
|
+
--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
|
77
|
+
--volume /run/tcpx:/run/tcpx \
|
78
|
+
--entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \
|
79
|
+
us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd \
|
80
|
+
--gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0";
|
81
|
+
sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT;
|
82
|
+
docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl;
|
83
|
+
sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
|
84
|
+
sudo mount -o remount,exec /var/lib/tcpx;
|
85
|
+
echo "GPU Direct TCPX installed"
|
86
|
+
"""
|
87
|
+
|
88
|
+
GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
|
89
|
+
'--cap-add=IPC_LOCK',
|
90
|
+
'--userns=host',
|
91
|
+
'--volume /run/tcpx:/run/tcpx',
|
92
|
+
'--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64',
|
93
|
+
'--volume /var/lib/tcpx/lib64:/usr/local/tcpx/lib64',
|
94
|
+
'--volume /var/lib/nvidia/bin:/usr/local/nvidia/bin',
|
95
|
+
'--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864',
|
96
|
+
'--device /dev/nvidia0:/dev/nvidia0',
|
97
|
+
'--device /dev/nvidia1:/dev/nvidia1',
|
98
|
+
'--device /dev/nvidia2:/dev/nvidia2',
|
99
|
+
'--device /dev/nvidia3:/dev/nvidia3',
|
100
|
+
'--device /dev/nvidia4:/dev/nvidia4',
|
101
|
+
'--device /dev/nvidia5:/dev/nvidia5',
|
102
|
+
'--device /dev/nvidia6:/dev/nvidia6',
|
103
|
+
'--device /dev/nvidia7:/dev/nvidia7',
|
104
|
+
'--device /dev/nvidia-uvm:/dev/nvidia-uvm',
|
105
|
+
'--device /dev/nvidiactl:/dev/nvidiactl',
|
106
|
+
'--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
|
107
|
+
]
|
44
108
|
|
45
109
|
# Below parameters are from the default VPC on GCP.
|
46
110
|
# https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
|
sky/provision/nebius/instance.py
CHANGED
@@ -132,7 +132,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
132
132
|
region=region,
|
133
133
|
image_family=config.node_config['ImageId'],
|
134
134
|
disk_size=config.node_config['DiskSize'],
|
135
|
-
user_data=config.node_config['UserData']
|
135
|
+
user_data=config.node_config['UserData'],
|
136
|
+
associate_public_ip_address=(
|
137
|
+
not config.provider_config['use_internal_ips']))
|
136
138
|
except Exception as e: # pylint: disable=broad-except
|
137
139
|
logger.warning(f'run_instances error: {e}')
|
138
140
|
raise
|
sky/provision/nebius/utils.py
CHANGED
@@ -158,7 +158,7 @@ def start(instance_id: str) -> None:
|
|
158
158
|
|
159
159
|
def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
160
160
|
preset: str, region: str, image_family: str, disk_size: int,
|
161
|
-
user_data: str) -> str:
|
161
|
+
user_data: str, associate_public_ip_address: bool) -> str:
|
162
162
|
# Each node must have a unique name to avoid conflicts between
|
163
163
|
# multiple worker VMs. To ensure uniqueness,a UUID is appended
|
164
164
|
# to the node name.
|
@@ -242,7 +242,9 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
|
242
242
|
subnet_id=sub_net.items[0].metadata.id,
|
243
243
|
ip_address=nebius.compute().IPAddress(),
|
244
244
|
name='network-interface-0',
|
245
|
-
public_ip_address=nebius.compute().PublicIPAddress()
|
245
|
+
public_ip_address=nebius.compute().PublicIPAddress()
|
246
|
+
if associate_public_ip_address else None,
|
247
|
+
)
|
246
248
|
]))).wait()
|
247
249
|
instance_id = ''
|
248
250
|
retry_count = 0
|
sky/server/requests/executor.py
CHANGED
@@ -18,7 +18,10 @@ The number of the workers is determined by the system resources.
|
|
18
18
|
|
19
19
|
See the [README.md](../README.md) for detailed architecture of the executor.
|
20
20
|
"""
|
21
|
+
import asyncio
|
21
22
|
import contextlib
|
23
|
+
import contextvars
|
24
|
+
import functools
|
22
25
|
import multiprocessing
|
23
26
|
import os
|
24
27
|
import queue as queue_lib
|
@@ -47,6 +50,7 @@ from sky.server.requests.queues import mp_queue
|
|
47
50
|
from sky.skylet import constants
|
48
51
|
from sky.utils import annotations
|
49
52
|
from sky.utils import common_utils
|
53
|
+
from sky.utils import context
|
50
54
|
from sky.utils import subprocess_utils
|
51
55
|
from sky.utils import timeline
|
52
56
|
|
@@ -60,7 +64,6 @@ else:
|
|
60
64
|
from typing_extensions import ParamSpec
|
61
65
|
|
62
66
|
P = ParamSpec('P')
|
63
|
-
|
64
67
|
logger = sky_logging.init_logger(__name__)
|
65
68
|
|
66
69
|
# On macOS, the default start method for multiprocessing is 'fork', which
|
@@ -341,6 +344,114 @@ def _request_execution_wrapper(request_id: str,
|
|
341
344
|
logger.info(f'Request {request_id} finished')
|
342
345
|
|
343
346
|
|
347
|
+
async def execute_request_coroutine(request: api_requests.Request):
    """Execute a request in current event loop.

    Similar to _request_execution_wrapper, but executed as coroutine in current
    event loop. This is designed for executing tasks that are not CPU
    intensive, e.g. sky logs.

    The request's entrypoint is submitted to the loop's default executor and
    this coroutine then polls every 0.5 seconds until the request is observed
    as CANCELLED or the submitted call has finished.

    Args:
        request: The request to run. Its ``entrypoint``, ``request_body``,
            ``request_id`` and ``log_path`` are read.

    Raises:
        ValueError: If ``context.get()`` returns ``None``.
        Exception: Any non-cancellation error raised while polling (for
            example ``RuntimeError`` when the request can no longer be found)
            is recorded as the request's failure and re-raised.
    """
    ctx = context.get()
    if ctx is None:
        raise ValueError('Context is not initialized')
    logger.info(f'Executing request {request.request_id} in coroutine')
    func = request.entrypoint
    request_body = request.request_body
    with api_requests.update_request(request.request_id) as request_task:
        request_task.status = api_requests.RequestStatus.RUNNING
    # Redirect stdout and stderr to the request log path.
    original_output = ctx.redirect_log(request.log_path)
    # Override environment variables that backs env_options.Options
    # TODO(aylei): compared to process executor, running task in coroutine has
    # two issues to fix:
    # 1. skypilot config is not contextual
    # 2. envs that read directly from os.environ are not contextual
    ctx.override_envs(request_body.env_vars)
    loop = asyncio.get_running_loop()
    # Run the entrypoint inside a copy of the current contextvars context so
    # the executor thread sees the values set above.
    pyctx = contextvars.copy_context()
    func_call = functools.partial(pyctx.run, func, **request_body.to_kwargs())
    fut: asyncio.Future = loop.run_in_executor(None, func_call)

    async def poll_task(request_id: str) -> bool:
        """Check the request once; return True when polling should stop."""
        current = api_requests.get_request(request_id)
        if current is None:
            raise RuntimeError('Request not found')

        if current.status == api_requests.RequestStatus.CANCELLED:
            ctx.cancel()
            return True

        if fut.done():
            try:
                result = await fut
                api_requests.set_request_succeeded(request_id, result)
            except asyncio.CancelledError:
                # The task is cancelled by ctx.cancel(), where the status
                # should already be set to CANCELLED.
                pass
            except Exception as e:  # pylint: disable=broad-except
                ctx.redirect_log(original_output)
                api_requests.set_request_failed(request_id, e)
                logger.error(f'Request {request_id} failed due to '
                             f'{common_utils.format_exception(e)}')
            return True
        return False

    try:
        while True:
            res = await poll_task(request.request_id)
            if res:
                break
            await asyncio.sleep(0.5)
    except asyncio.CancelledError:
        # Current coroutine is cancelled due to client disconnect. Cancel the
        # context first so the call handed to the executor does not keep
        # running with nobody polling it, then set the request status for
        # consistency.
        ctx.cancel()
        api_requests.set_request_cancelled(request.request_id)
    # pylint: disable=broad-except
    except (Exception, KeyboardInterrupt, SystemExit) as e:
        # Handle any other error
        ctx.redirect_log(original_output)
        ctx.cancel()
        api_requests.set_request_failed(request.request_id, e)
        logger.error(f'Request {request.request_id} interrupted due to '
                     f'unhandled exception: {common_utils.format_exception(e)}')
        raise
|
420
|
+
|
421
|
+
|
422
|
+
def prepare_request(
    request_id: str,
    request_name: str,
    request_body: payloads.RequestBody,
    func: Callable[P, Any],
    request_cluster_name: Optional[str] = None,
    schedule_type: api_requests.ScheduleType = (api_requests.ScheduleType.LONG),
    is_skypilot_system: bool = False,
) -> api_requests.Request:
    """Prepare a request for execution.

    Builds a PENDING request record, stores it, and touches its log file.

    Args:
        request_id: ID of the new request.
        request_name: Name appended to ``server_constants.REQUEST_NAME_PREFIX``.
        request_body: Request payload; its ``env_vars`` must contain
            ``constants.USER_ID_ENV_VAR``.
        func: Entrypoint stored on the request.
        request_cluster_name: Cluster name stored on the request, if any.
        schedule_type: Schedule type stored on the request.
        is_skypilot_system: When true, the request is attributed to
            ``server_constants.SKYPILOT_SYSTEM_USER_ID`` instead of the user
            ID from ``env_vars``.

    Returns:
        The newly created request.

    Raises:
        RuntimeError: If ``api_requests.create_if_not_exists`` reports that
            the request was not created.
    """
    # The user ID is always read from env_vars, even when it is overridden
    # for system requests below.
    owner_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
    if is_skypilot_system:
        owner_id = server_constants.SKYPILOT_SYSTEM_USER_ID
        # NOTE(review): this call is assumed to be nested under the system
        # branch; the nesting was reconstructed from a flattened diff, confirm
        # against the original file.
        system_user = models.User(id=owner_id, name=owner_id)
        global_user_state.add_or_update_user(system_user)

    full_name = server_constants.REQUEST_NAME_PREFIX + request_name
    new_request = api_requests.Request(
        request_id=request_id,
        name=full_name,
        entrypoint=func,
        request_body=request_body,
        status=api_requests.RequestStatus.PENDING,
        created_at=time.time(),
        schedule_type=schedule_type,
        user_id=owner_id,
        cluster_name=request_cluster_name,
    )

    created = api_requests.create_if_not_exists(new_request)
    if not created:
        raise RuntimeError(f'Request {request_id} already exists.')

    new_request.log_path.touch()
    return new_request
|
453
|
+
|
454
|
+
|
344
455
|
def schedule_request(
|
345
456
|
request_id: str,
|
346
457
|
request_name: str,
|
@@ -372,27 +483,8 @@ def schedule_request(
|
|
372
483
|
The precondition is waited asynchronously and does not block the
|
373
484
|
caller.
|
374
485
|
"""
|
375
|
-
|
376
|
-
|
377
|
-
user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
|
378
|
-
global_user_state.add_or_update_user(
|
379
|
-
models.User(id=user_id, name=user_id))
|
380
|
-
request = api_requests.Request(request_id=request_id,
|
381
|
-
name=server_constants.REQUEST_NAME_PREFIX +
|
382
|
-
request_name,
|
383
|
-
entrypoint=func,
|
384
|
-
request_body=request_body,
|
385
|
-
status=api_requests.RequestStatus.PENDING,
|
386
|
-
created_at=time.time(),
|
387
|
-
schedule_type=schedule_type,
|
388
|
-
user_id=user_id,
|
389
|
-
cluster_name=request_cluster_name)
|
390
|
-
|
391
|
-
if not api_requests.create_if_not_exists(request):
|
392
|
-
logger.debug(f'Request {request_id} already exists.')
|
393
|
-
return
|
394
|
-
|
395
|
-
request.log_path.touch()
|
486
|
+
prepare_request(request_id, request_name, request_body, func,
|
487
|
+
request_cluster_name, schedule_type, is_skypilot_system)
|
396
488
|
|
397
489
|
def enqueue():
|
398
490
|
input_tuple = (request_id, ignore_return_value)
|
sky/server/requests/requests.py
CHANGED
@@ -606,3 +606,18 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
|
|
606
606
|
assert request_task is not None, request_id
|
607
607
|
request_task.status = RequestStatus.FAILED
|
608
608
|
request_task.set_error(e)
|
609
|
+
|
610
|
+
|
611
|
+
def set_request_succeeded(request_id: str, result: Any) -> None:
    """Mark a request as SUCCEEDED and store its return value.

    Args:
        request_id: ID of the request to update.
        result: Value passed to the request's ``set_return_value``.
    """
    with update_request(request_id) as record:
        # The request is expected to exist; the ID is the assertion message.
        assert record is not None, request_id
        record.status = RequestStatus.SUCCEEDED
        record.set_return_value(result)
|
617
|
+
|
618
|
+
|
619
|
+
def set_request_cancelled(request_id: str) -> None:
    """Mark a request as CANCELLED.

    Args:
        request_id: ID of the request to update.
    """
    with update_request(request_id) as record:
        # The request is expected to exist; the ID is the assertion message.
        assert record is not None, request_id
        record.status = RequestStatus.CANCELLED
|