skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20250513__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +3 -0
  3. sky/backends/cloud_vm_ray_backend.py +7 -0
  4. sky/cli.py +109 -109
  5. sky/client/cli.py +109 -109
  6. sky/clouds/gcp.py +35 -8
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → 2dkponv64SfFShA8Rnw0D}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +1 -0
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/index.html +1 -1
  14. sky/dashboard/out/jobs/[job].html +1 -1
  15. sky/dashboard/out/jobs.html +1 -1
  16. sky/global_user_state.py +2 -0
  17. sky/provision/docker_utils.py +4 -1
  18. sky/provision/gcp/config.py +197 -15
  19. sky/provision/gcp/constants.py +64 -0
  20. sky/provision/gcp/instance.py +5 -3
  21. sky/provision/gcp/instance_utils.py +8 -4
  22. sky/provision/nebius/instance.py +3 -1
  23. sky/provision/nebius/utils.py +4 -2
  24. sky/server/requests/executor.py +114 -22
  25. sky/server/requests/requests.py +15 -0
  26. sky/server/server.py +12 -7
  27. sky/server/uvicorn.py +12 -2
  28. sky/sky_logging.py +40 -2
  29. sky/skylet/constants.py +3 -0
  30. sky/skylet/log_lib.py +51 -11
  31. sky/templates/gcp-ray.yml.j2 +11 -0
  32. sky/templates/nebius-ray.yml.j2 +4 -0
  33. sky/templates/websocket_proxy.py +29 -9
  34. sky/utils/command_runner.py +3 -0
  35. sky/utils/context.py +264 -0
  36. sky/utils/context_utils.py +172 -0
  37. sky/utils/rich_utils.py +81 -37
  38. sky/utils/schemas.py +9 -1
  39. sky/utils/subprocess_utils.py +8 -2
  40. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/METADATA +1 -5
  41. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/RECORD +46 -44
  42. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/WHEEL +1 -1
  43. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  44. sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → 2dkponv64SfFShA8Rnw0D}/_ssgManifest.js +0 -0
  45. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/entry_points.txt +0 -0
  46. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/licenses/LICENSE +0 -0
  47. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/top_level.txt +0 -0
sky/dashboard/out/jobs.html CHANGED
@@ -1 +1 @@
1
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><link rel="preload" href="/dashboard/skypilot.svg" as="image" fetchpriority="high"/><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-0f8017370869e269.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-f49500b82ad5392d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js" defer=""></script><script src="/dashboard/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LksQgChY5izXjokL3LcEu/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div class="min-h-screen bg-gray-50"><div class="fixed top-0 left-0 right-0 z-50 shadow-sm"><div class="fixed top-0 left-0 right-0 bg-white z-30 h-14 px-4 border-b border-gray-200 shadow-sm"><div class="flex items-center h-full"><div class="flex items-center space-x-4 mr-6"><a class="flex items-center px-1 pt-1 h-full" 
href="/dashboard"><div class="h-20 w-20 flex items-center justify-center"><img alt="SkyPilot Logo" fetchpriority="high" width="80" height="80" decoding="async" data-nimg="1" class="w-full h-full object-contain" style="color:transparent" src="/dashboard/skypilot.svg"/></div></a></div><div class="flex items-center space-x-2 md:space-x-6 mr-6"><a class="inline-flex items-center border-b-2 border-transparent hover:text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/clusters"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="20" height="8" x="2" y="2" rx="2" ry="2"></rect><rect width="20" height="8" x="2" y="14" rx="2" ry="2"></rect><line x1="6" x2="6.01" y1="6" y2="6"></line><line x1="6" x2="6.01" y1="18" y2="18"></line></svg><span>Clusters</span></a><a class="inline-flex items-center border-b-2 border-transparent text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/jobs"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M16 20V4a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"></path><rect width="20" height="14" x="2" y="6" rx="2"></rect></svg><span>Jobs</span></a><div class="inline-flex items-center px-1 pt-1 text-gray-400"><svg class="w-4 h-4" viewBox="0 0 423.683 423.683" width="24" height="24" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" fill="currentColor" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><g id="SVGRepo_bgCarrier" stroke-width="0"></g><g id="SVGRepo_tracerCarrier" stroke-linecap="round" stroke-linejoin="round"></g><g id="SVGRepo_iconCarrier"><g><path d="M54.376,287.577h310.459c26.48,0,48.02-13.979,48.02-40.453c0-17.916-10.001-34.07-25.559-42.292 
c-19.021-72.951-86.061-125.196-162.002-125.223v-3.431h-3.854V61.814h3.854v-9.569h-31.38v9.569h3.854v14.363h-3.854v3.431 c-75.941,0.026-142.97,52.272-161.988,125.217c-15.56,8.216-25.573,24.376-25.573,42.291 C6.36,273.597,27.896,287.577,54.376,287.577z M47.676,227.145l7.214-2.424l1.617-7.447 c13.884-64.232,71.707-110.862,137.467-110.862h31.274c65.763,0,123.582,46.63,137.473,110.862l1.607,7.447l7.223,2.424 c8.678,2.92,14.506,10.946,14.506,19.979c0,11.703-9.517,13.647-21.221,13.647H54.376c-11.7,0-21.22-1.944-21.22-13.647 C33.162,238.091,38.984,230.065,47.676,227.145z M423.683,334.602v36.836H0v-36.836h25.348v-18.418h372.99v18.418H423.683z"></path></g></g></svg><span class="ml-2">Services</span><span class="text-xs ml-2 px-1.5 py-0.5 bg-gray-100 text-gray-500 rounded">Soon</span></div></div><div class="flex items-center space-x-1 ml-auto"><a href="https://skypilot.readthedocs.io/en/latest/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center px-2 py-1 text-gray-600 hover:text-blue-600 transition-colors duration-150 cursor-pointer" title="Docs"><span class="mr-1">Docs</span><svg class="w-3.5 h-3.5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"></path><polyline points="15 3 21 3 21 9"></polyline><line x1="10" y1="14" x2="21" y2="3"></line></svg></a><div class="border-l border-gray-200 h-6 mx-1"></div><a href="https://github.com/skypilot-org/skypilot" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="GitHub"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 
11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"></path></svg></a><a href="https://slack.skypilot.co/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Slack"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path transform="scale(0.85) translate(1.8, 1.8)" d="M5.042 15.165a2.528 2.528 0 0 1-2.52 2.523A2.528 2.528 0 0 1 0 15.165a2.527 2.527 0 0 1 2.522-2.52h2.52v2.52zM6.313 15.165a2.527 2.527 0 0 1 2.521-2.52 2.527 2.527 0 0 1 2.521 2.52v6.313A2.528 2.528 0 0 1 8.834 24a2.528 2.528 0 0 1-2.521-2.522v-6.313zM8.834 5.042a2.528 2.528 0 0 1-2.521-2.52A2.528 2.528 0 0 1 8.834 0a2.528 2.528 0 0 1 2.521 2.522v2.52H8.834zM8.834 6.313a2.528 2.528 0 0 1 2.521 2.521 2.528 2.528 0 0 1-2.521 2.521H2.522A2.528 2.528 0 0 1 0 8.834a2.528 2.528 0 0 1 2.522-2.521h6.312zM18.956 8.834a2.528 2.528 0 0 1 2.522-2.521A2.528 2.528 0 0 1 24 8.834a2.528 2.528 0 0 1-2.522 2.521h-2.522V8.834zM17.688 8.834a2.528 2.528 0 0 1-2.523 2.521 2.527 2.527 0 0 1-2.52-2.521V2.522A2.527 2.527 0 0 1 15.165 0a2.528 2.528 0 0 1 2.523 2.522v6.312zM15.165 18.956a2.528 2.528 0 0 1 2.523 2.522A2.528 2.528 0 0 1 15.165 24a2.527 2.527 0 0 1-2.52-2.522v-2.522h2.52zM15.165 17.688a2.527 2.527 0 0 
1-2.52-2.523 2.526 2.526 0 0 1 2.52-2.52h6.313A2.527 2.527 0 0 1 24 15.165a2.528 2.528 0 0 1-2.522 2.523h-6.313z"></path></svg></a><a href="https://github.com/skypilot-org/skypilot/issues/new" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Leave Feedback"><svg class="w-5 h-5" stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><g><path fill="none" d="M0 0h24v24H0z"></path><path d="M6.455 19L2 22.5V4a1 1 0 0 1 1-1h18a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H6.455zM4 18.385L5.763 17H20V5H4v13.385zM11 13h2v2h-2v-2zm0-6h2v5h-2V7z"></path></g></svg></a></div></div></div></div><div class="transition-all duration-200 ease-in-out min-h-screen" style="padding-top:56px"><main class="p-6"><div class="flex items-center justify-between mb-4 h-5"><div class="text-base"><a class="text-sky-blue hover:underline leading-none" href="/dashboard/jobs">Managed Jobs</a></div><div class="flex items-center space-x-2"><button class="inline-flex items-center justify-center whitespace-nowrap text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 hover:bg-accent h-9 rounded-md px-3 text-sky-blue hover:text-sky-blue-bright" title="Refresh"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-rotate-cw h-4 w-4 mr-1.5"><path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8"></path><path d="M21 3v5h-5"></path></svg><span>Refresh</span></button></div></div><div class="relative"><div class="flex flex-col space-y-1 mb-1"><div class="flex flex-wrap items-center text-sm mb-1"><span class="mr-2 text-sm 
font-medium">Statuses:</span><div class="flex flex-wrap gap-2 items-center"></div></div></div><div class="rounded-lg border bg-card text-card-foreground shadow-sm"><div class="relative w-full overflow-auto"><table class="w-full caption-bottom text-base"><thead class="[&amp;_tr]:border-b"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">ID</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Name</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">User</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Submitted</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Duration</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Status</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Resources</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Cluster</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Region</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Recoveries</th><th class="h-12 
px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0">Details</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0">Logs</th></tr></thead><tbody class="[&amp;_tr:last-child]:border-0"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><td class="p-4 align-middle [&amp;:has([role=checkbox])]:pr-0 text-center py-6" colSpan="12"><div class="flex flex-col items-center space-y-4"><p class="text-gray-500">No active jobs</p></div></td></tr></tbody></table></div></div></div></main></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"LksQgChY5izXjokL3LcEu","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><link rel="preload" href="/dashboard/skypilot.svg" as="image" fetchpriority="high"/><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-0ca6f2c1ba667c3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-f49500b82ad5392d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js" defer=""></script><script src="/dashboard/_next/static/2dkponv64SfFShA8Rnw0D/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2dkponv64SfFShA8Rnw0D/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div class="min-h-screen bg-gray-50"><div class="fixed top-0 left-0 right-0 z-50 shadow-sm"><div class="fixed top-0 left-0 right-0 bg-white z-30 h-14 px-4 border-b border-gray-200 shadow-sm"><div class="flex items-center h-full"><div class="flex items-center space-x-4 mr-6"><a class="flex items-center px-1 pt-1 h-full" 
href="/dashboard"><div class="h-20 w-20 flex items-center justify-center"><img alt="SkyPilot Logo" fetchpriority="high" width="80" height="80" decoding="async" data-nimg="1" class="w-full h-full object-contain" style="color:transparent" src="/dashboard/skypilot.svg"/></div></a></div><div class="flex items-center space-x-2 md:space-x-6 mr-6"><a class="inline-flex items-center border-b-2 border-transparent hover:text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/clusters"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="20" height="8" x="2" y="2" rx="2" ry="2"></rect><rect width="20" height="8" x="2" y="14" rx="2" ry="2"></rect><line x1="6" x2="6.01" y1="6" y2="6"></line><line x1="6" x2="6.01" y1="18" y2="18"></line></svg><span>Clusters</span></a><a class="inline-flex items-center border-b-2 border-transparent text-blue-600 px-1 pt-1 space-x-2" href="/dashboard/jobs"><svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M16 20V4a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"></path><rect width="20" height="14" x="2" y="6" rx="2"></rect></svg><span>Jobs</span></a><div class="inline-flex items-center px-1 pt-1 text-gray-400"><svg class="w-4 h-4" viewBox="0 0 423.683 423.683" width="24" height="24" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" fill="currentColor" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><g id="SVGRepo_bgCarrier" stroke-width="0"></g><g id="SVGRepo_tracerCarrier" stroke-linecap="round" stroke-linejoin="round"></g><g id="SVGRepo_iconCarrier"><g><path d="M54.376,287.577h310.459c26.48,0,48.02-13.979,48.02-40.453c0-17.916-10.001-34.07-25.559-42.292 
c-19.021-72.951-86.061-125.196-162.002-125.223v-3.431h-3.854V61.814h3.854v-9.569h-31.38v9.569h3.854v14.363h-3.854v3.431 c-75.941,0.026-142.97,52.272-161.988,125.217c-15.56,8.216-25.573,24.376-25.573,42.291 C6.36,273.597,27.896,287.577,54.376,287.577z M47.676,227.145l7.214-2.424l1.617-7.447 c13.884-64.232,71.707-110.862,137.467-110.862h31.274c65.763,0,123.582,46.63,137.473,110.862l1.607,7.447l7.223,2.424 c8.678,2.92,14.506,10.946,14.506,19.979c0,11.703-9.517,13.647-21.221,13.647H54.376c-11.7,0-21.22-1.944-21.22-13.647 C33.162,238.091,38.984,230.065,47.676,227.145z M423.683,334.602v36.836H0v-36.836h25.348v-18.418h372.99v18.418H423.683z"></path></g></g></svg><span class="ml-2">Services</span><span class="text-xs ml-2 px-1.5 py-0.5 bg-gray-100 text-gray-500 rounded">Soon</span></div></div><div class="flex items-center space-x-1 ml-auto"><a href="https://skypilot.readthedocs.io/en/latest/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center px-2 py-1 text-gray-600 hover:text-blue-600 transition-colors duration-150 cursor-pointer" title="Docs"><span class="mr-1">Docs</span><svg class="w-3.5 h-3.5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"></path><polyline points="15 3 21 3 21 9"></polyline><line x1="10" y1="14" x2="21" y2="3"></line></svg></a><div class="border-l border-gray-200 h-6 mx-1"></div><a href="https://github.com/skypilot-org/skypilot" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="GitHub"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 
11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"></path></svg></a><a href="https://slack.skypilot.co/" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Slack"><svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path transform="scale(0.85) translate(1.8, 1.8)" d="M5.042 15.165a2.528 2.528 0 0 1-2.52 2.523A2.528 2.528 0 0 1 0 15.165a2.527 2.527 0 0 1 2.522-2.52h2.52v2.52zM6.313 15.165a2.527 2.527 0 0 1 2.521-2.52 2.527 2.527 0 0 1 2.521 2.52v6.313A2.528 2.528 0 0 1 8.834 24a2.528 2.528 0 0 1-2.521-2.522v-6.313zM8.834 5.042a2.528 2.528 0 0 1-2.521-2.52A2.528 2.528 0 0 1 8.834 0a2.528 2.528 0 0 1 2.521 2.522v2.52H8.834zM8.834 6.313a2.528 2.528 0 0 1 2.521 2.521 2.528 2.528 0 0 1-2.521 2.521H2.522A2.528 2.528 0 0 1 0 8.834a2.528 2.528 0 0 1 2.522-2.521h6.312zM18.956 8.834a2.528 2.528 0 0 1 2.522-2.521A2.528 2.528 0 0 1 24 8.834a2.528 2.528 0 0 1-2.522 2.521h-2.522V8.834zM17.688 8.834a2.528 2.528 0 0 1-2.523 2.521 2.527 2.527 0 0 1-2.52-2.521V2.522A2.527 2.527 0 0 1 15.165 0a2.528 2.528 0 0 1 2.523 2.522v6.312zM15.165 18.956a2.528 2.528 0 0 1 2.523 2.522A2.528 2.528 0 0 1 15.165 24a2.527 2.527 0 0 1-2.52-2.522v-2.522h2.52zM15.165 17.688a2.527 2.527 0 0 
1-2.52-2.523 2.526 2.526 0 0 1 2.52-2.52h6.313A2.527 2.527 0 0 1 24 15.165a2.528 2.528 0 0 1-2.522 2.523h-6.313z"></path></svg></a><a href="https://github.com/skypilot-org/skypilot/issues/new" target="_blank" rel="noopener noreferrer" class="inline-flex items-center justify-center p-2 rounded-full text-gray-600 hover:bg-gray-100 transition-colors duration-150 cursor-pointer" title="Leave Feedback"><svg class="w-5 h-5" stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><g><path fill="none" d="M0 0h24v24H0z"></path><path d="M6.455 19L2 22.5V4a1 1 0 0 1 1-1h18a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H6.455zM4 18.385L5.763 17H20V5H4v13.385zM11 13h2v2h-2v-2zm0-6h2v5h-2V7z"></path></g></svg></a></div></div></div></div><div class="transition-all duration-200 ease-in-out min-h-screen" style="padding-top:56px"><main class="p-6"><div class="flex items-center justify-between mb-4 h-5"><div class="text-base"><a class="text-sky-blue hover:underline leading-none" href="/dashboard/jobs">Managed Jobs</a></div><div class="flex items-center space-x-2"><button class="inline-flex items-center justify-center whitespace-nowrap text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 hover:bg-accent h-9 rounded-md px-3 text-sky-blue hover:text-sky-blue-bright" title="Refresh"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-rotate-cw h-4 w-4 mr-1.5"><path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8"></path><path d="M21 3v5h-5"></path></svg><span>Refresh</span></button></div></div><div class="relative"><div class="flex flex-col space-y-1 mb-1"><div class="flex flex-wrap items-center text-sm mb-1"><span class="mr-2 text-sm 
font-medium">Statuses:</span><div class="flex flex-wrap gap-2 items-center"></div></div></div><div class="rounded-lg border bg-card text-card-foreground shadow-sm"><div class="relative w-full overflow-auto"><table class="w-full caption-bottom text-base"><thead class="[&amp;_tr]:border-b"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">ID</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Name</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">User</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Submitted</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Duration</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Status</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Resources</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Cluster</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Region</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0 sortable whitespace-nowrap">Recoveries</th><th class="h-12 
px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0">Details</th><th class="h-12 px-4 text-left align-middle font-medium text-[hsl(var(--text-strong))] [&amp;:has([role=checkbox])]:pr-0">Logs</th></tr></thead><tbody class="[&amp;_tr:last-child]:border-0"><tr class="border-b transition-colors hover:bg-muted/50 data-[state=selected]:bg-muted"><td class="p-4 align-middle [&amp;:has([role=checkbox])]:pr-0 text-center py-6" colSpan="12"><div class="flex flex-col items-center space-y-4"><p class="text-gray-500">No active jobs</p></div></td></tr></tbody></table></div></div></div></main></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/jobs","query":{},"buildId":"2dkponv64SfFShA8Rnw0D","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py CHANGED
@@ -19,6 +19,7 @@ import uuid
19
19
  from sky import models
20
20
  from sky import sky_logging
21
21
  from sky.utils import common_utils
22
+ from sky.utils import context_utils
22
23
  from sky.utils import db_utils
23
24
  from sky.utils import registry
24
25
  from sky.utils import status_lib
@@ -671,6 +672,7 @@ def _load_storage_mounts_metadata(
671
672
  return pickle.loads(record_storage_mounts_metadata)
672
673
 
673
674
 
675
+ @context_utils.cancellation_guard
674
676
  def get_cluster_from_name(
675
677
  cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
676
678
  rows = _DB.cursor.execute(
sky/provision/docker_utils.py CHANGED
@@ -343,9 +343,12 @@ class DockerInitializer:
343
343
  # `mesg: ttyname failed: inappropriate ioctl for device`.
344
344
  # see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
345
345
  port = constants.DEFAULT_DOCKER_PORT
346
+ # In case the port is already configured in the sshd_config file
347
+ # in some images, we delete it first and then append the new one.
346
348
  # pylint: disable=anomalous-backslash-in-string
347
349
  self._run(
348
- f'sudo sed -i "s/#Port 22/Port {port}/" /etc/ssh/sshd_config;'
350
+ 'sudo sed -i "/^Port .*/d" /etc/ssh/sshd_config;'
351
+ f'sudo echo "Port {port}" >> /etc/ssh/sshd_config;'
349
352
  'mkdir -p ~/.ssh;'
350
353
  'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
351
354
  'sudo service ssh start;'
sky/provision/gcp/config.py CHANGED
@@ -75,6 +75,30 @@ def wait_for_compute_global_operation(project_name, operation, compute):
75
75
  return result
76
76
 
77
77
 
78
+ def wait_for_compute_region_operation(project_name, region, operation, compute):
79
+ """Poll for region compute operation until finished."""
80
+ logger.info('wait_for_compute_region_operation: '
81
+ 'Waiting for operation {} to finish...'.format(
82
+ operation['name']))
83
+
84
+ for _ in range(constants.MAX_POLLS):
85
+ result = (compute.regionOperations().get(
86
+ project=project_name,
87
+ region=region,
88
+ operation=operation['name'],
89
+ ).execute())
90
+ if 'error' in result:
91
+ raise Exception(result['error'])
92
+
93
+ if result['status'] == 'DONE':
94
+ logger.info('wait_for_compute_region_operation: Operation done.')
95
+ break
96
+
97
+ time.sleep(constants.POLL_INTERVAL)
98
+
99
+ return result
100
+
101
+
78
102
  def _create_crm(gcp_credentials=None):
79
103
  return gcp.build('cloudresourcemanager',
80
104
  'v1',
@@ -168,6 +192,7 @@ def bootstrap_instances(
168
192
  iam_role = _configure_iam_role(config, crm, iam)
169
193
  config.node_config.update(iam_role)
170
194
  config = _configure_subnet(region, cluster_name, config, compute)
195
+ config = _configure_placement_policy(region, cluster_name, config, compute)
171
196
 
172
197
  return config
173
198
 
@@ -660,6 +685,95 @@ def get_usable_vpc_and_subnet(
660
685
  return usable_vpc_name, usable_subnet
661
686
 
662
687
 
688
def get_gpu_direct_usable_vpcs_and_subnets(
    cluster_name: str,
    region: str,
    config: common.ProvisionConfig,
    compute,
) -> List[Tuple[str, 'google.cloud.compute_v1.types.compute.Subnetwork']]:
    """Make sure the GPU Direct networks exist and return them.

    For every index in ``range(constants.SKYPILOT_GPU_DIRECT_VPC_NUM)`` a VPC
    (index 0 is named ``...-mgmt-net``, the others ``...-data-net-<index>``)
    and a ``/24`` subnet in ``region`` are looked up and created when
    missing; firewall rules are then applied to the VPC.

    Args:
        cluster_name: Cluster name; only its first
            ``constants.CLUSTER_PREFIX_LENGTH`` characters go into names.
        region: Region in which subnets are listed and created.
        config: Provision config; ``provider_config['project_id']`` is read.
        compute: Compute API client forwarded to the helper functions.

    Returns:
        A list of ``(vpc_name, subnet)`` tuples, one per VPC, where
        ``subnet`` is the first subnet listed for that VPC.
    """
    project_id = config.provider_config['project_id']
    short_cluster_name = cluster_name[:constants.CLUSTER_PREFIX_LENGTH]
    name_stem = f'{constants.SKYPILOT}-{short_cluster_name}'

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    vpc_count = constants.SKYPILOT_GPU_DIRECT_VPC_NUM
    cidr_prefix = constants.SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX

    pairs = []
    for index in range(vpc_count):
        role_suffix = 'mgmt-net' if index == 0 else f'data-net-{index}'
        vpc_name = f'{name_stem}-{role_suffix}'

        # Create the VPC only when no network with this name is listed.
        if not _list_vpcnets(project_id, compute, filter=f'name={vpc_name}'):
            vpc_body = constants.VPC_TEMPLATE.copy()
            vpc_body.update(mtu=8244,
                            autoCreateSubnetworks=False,
                            name=vpc_name)
            vpc_body['selfLink'] = vpc_body['selfLink'].format(
                PROJ_ID=project_id, VPC_NAME=vpc_name)
            _create_vpcnet(project_id, compute, vpc_body)

        # Create the subnet only when the VPC has none in this region, then
        # list again to pick up the created resource.
        subnets = _list_subnets(project_id, region, compute, network=vpc_name)
        if not subnets:
            _create_subnet(project_id, region, compute, vpc_name,
                           f'{vpc_name}-sub', f'{cidr_prefix}.{index}.0/24')
            subnets = _list_subnets(project_id,
                                    region,
                                    compute,
                                    network=vpc_name)

        # Firewall rules are applied on every pass, new VPC or not.
        _create_rules(project_id, compute, constants.FIREWALL_RULES_TEMPLATE,
                      vpc_name)
        pairs.append((vpc_name, subnets[0]))
    return pairs
735
+
736
+
737
def _configure_placement_policy(region: str, cluster_name: str,
                                config: common.ProvisionConfig, compute):
    """Attach a compact placement policy to the node config when requested.

    Nothing is changed unless ``provider_config['placement_policy']``
    (compared case-insensitively) equals
    ``constants.COMPACT_GROUP_PLACEMENT_POLICY`` and
    ``use_managed_instance_group`` is falsy. Otherwise the policy named
    ``<cluster prefix>-placement-policy`` is looked up, created if missing,
    and its name is stored in ``node_config['resourcePolicies']``.

    Returns:
        The same ``config`` object that was passed in.
    """
    node_config = config.node_config
    provider_config = config.provider_config
    project_id = provider_config['project_id']
    requested_policy = provider_config.get('placement_policy', None)
    uses_mig = provider_config.get('use_managed_instance_group', False)

    # Placement policies cannot be combined with a managed instance group;
    # doing so fails with:
    # Reason: [{'code': 'UNSUPPORTED_OPERATION',
    #           'message': 'Creating queued resource with
    #                       resource policies is not supported.'}]
    # So only proceed for a compact policy without a MIG.
    wants_compact = (requested_policy is not None and
                     requested_policy.lower()
                     == constants.COMPACT_GROUP_PLACEMENT_POLICY)
    if not wants_compact or uses_mig:
        return config

    short_cluster_name = cluster_name[:constants.CLUSTER_PREFIX_LENGTH]
    policy_name = f'{short_cluster_name}-placement-policy'
    policy_body = {
        'name': policy_name,
        'groupPlacementPolicy': {
            'collocation': constants.COLLOCATED_COLLOCATION,
        }
    }

    # Reuse the policy if it already exists; create it otherwise.
    existing_policy = _get_placement_policy(project_id, region, compute,
                                            policy_name)
    if not existing_policy:
        logger.info(f'Creating placement policy {policy_name}'
                    f' for cluster {cluster_name}')
        _create_placement_policy(project_id, region, compute, policy_body)

    node_config['resourcePolicies'] = [policy_name]
    return config
775
+
776
+
663
777
  def _configure_subnet(region: str, cluster_name: str,
664
778
  config: common.ProvisionConfig, compute):
665
779
  """Pick a reasonable subnet if not specified by the config."""
@@ -671,25 +785,54 @@ def _configure_subnet(region: str, cluster_name: str,
671
785
  if 'networkInterfaces' in node_config or 'networkConfig' in node_config:
672
786
  return config
673
787
 
674
- # SkyPilot: make sure there's a usable VPC
675
- _, default_subnet = get_usable_vpc_and_subnet(cluster_name, region, config,
676
- compute)
677
-
678
- default_interfaces = [{
679
- 'subnetwork': default_subnet['selfLink'],
680
- 'accessConfigs': [{
681
- 'name': 'External NAT',
682
- 'type': 'ONE_TO_ONE_NAT',
683
- }]
684
- }]
685
- # Add gVNIC if specified in config
788
+ default_interfaces = []
789
+ enable_gpu_direct = config.provider_config.get('enable_gpu_direct', False)
686
790
  enable_gvnic = config.provider_config.get('enable_gvnic', False)
687
- if enable_gvnic:
688
- default_interfaces[0]['nicType'] = 'gVNIC'
791
+ if enable_gpu_direct:
792
+ if not enable_gvnic:
793
+ logger.warning(
794
+ 'Enable GPU Direct requires gvnic to be enabled, enabling gvnic'
795
+ )
796
+ config.provider_config['enable_gvnic'] = True
797
+ enable_gvnic = True
798
+ if 'machineType' not in node_config or node_config[
799
+ 'machineType'] not in constants.GPU_DIRECT_TCPX_INSTANCE_TYPES:
800
+ raise ValueError(
801
+ 'Enable GPU Direct requires machineType to be one of '
802
+ f'{constants.GPU_DIRECT_TCPX_INSTANCE_TYPES}')
803
+ logger.info(f'Enable GPU Direct for cluster {cluster_name} '
804
+ f'with machineType {node_config["machineType"]}')
805
+ vpc_subnet_pairs = get_gpu_direct_usable_vpcs_and_subnets(
806
+ cluster_name, region, config, compute)
807
+ for _, subnet in vpc_subnet_pairs:
808
+ default_interfaces.append({
809
+ 'subnetwork': subnet['selfLink'],
810
+ 'accessConfigs': [{
811
+ 'name': 'External NAT',
812
+ 'type': 'ONE_TO_ONE_NAT',
813
+ }],
814
+ 'nicType': 'gVNIC'
815
+ })
816
+ else:
817
+ # SkyPilot: make sure there's a usable VPC
818
+ _, default_subnet = get_usable_vpc_and_subnet(cluster_name, region,
819
+ config, compute)
820
+
821
+ default_interfaces = [{
822
+ 'subnetwork': default_subnet['selfLink'],
823
+ 'accessConfigs': [{
824
+ 'name': 'External NAT',
825
+ 'type': 'ONE_TO_ONE_NAT',
826
+ }]
827
+ }]
828
+ # Add gVNIC if specified in config
829
+ if enable_gvnic:
830
+ default_interfaces[0]['nicType'] = 'gVNIC'
689
831
  enable_external_ips = _enable_external_ips(config)
690
832
  if not enable_external_ips:
691
833
  # Removing this key means the VM will not be assigned an external IP.
692
- default_interfaces[0].pop('accessConfigs')
834
+ for interface in default_interfaces:
835
+ interface.pop('accessConfigs')
693
836
 
694
837
  # The not applicable key will be removed during node creation
695
838
 
@@ -840,3 +983,42 @@ def _add_iam_policy_binding(service_account, policy, crm, iam):
840
983
  ).execute())
841
984
 
842
985
  return result
986
+
987
+
988
def _create_subnet(project_id: str, region: str, compute, vpc_name: str,
                   subnet_name: str, ip_cidr_range: str):
    """Insert a subnet into ``vpc_name`` and wait for the operation.

    Args:
        project_id: Project that owns the network and the new subnet.
        region: Region of the subnet.
        compute: Compute API client exposing ``subnetworks()``.
        vpc_name: Name of the network the subnet is attached to.
        subnet_name: Name of the subnet to create.
        ip_cidr_range: CIDR range assigned to the subnet.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        insert operation.
    """
    network_path = f'projects/{project_id}/global/networks/{vpc_name}'
    subnet_body = {
        'name': subnet_name,
        'ipCidrRange': ip_cidr_range,
        'network': network_path,
        'region': region,
    }
    insert_request = compute.subnetworks().insert(project=project_id,
                                                  region=region,
                                                  body=subnet_body)
    insert_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             insert_operation, compute)
1002
+
1003
+
1004
def _create_placement_policy(project_id: str, region: str, compute,
                             placement_policy: dict):
    """Insert a resource policy and wait for the regional operation.

    Args:
        project_id: Project in which the policy is created.
        region: Region in which the policy is created.
        compute: Compute API client exposing ``resourcePolicies()``.
        placement_policy: Request body sent to ``resourcePolicies().insert``.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        insert operation.
    """
    insert_request = compute.resourcePolicies().insert(project=project_id,
                                                       region=region,
                                                       body=placement_policy)
    insert_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             insert_operation, compute)
1011
+
1012
+
1013
def _get_placement_policy(project_id: str, region: str, compute, name: str):
    """Fetch the resource policy ``name``, or ``None`` if it is missing.

    Args:
        project_id: Project to look in.
        region: Region to look in.
        compute: Compute API client exposing ``resourcePolicies()``.
        name: Name of the resource policy.

    Returns:
        The result of executing the ``get`` request, or ``None`` when the
        request fails with HTTP status 404.

    Raises:
        The exception type returned by ``gcp.http_error_exception()`` for
        any status other than 404.
    """
    try:
        return compute.resourcePolicies().get(
            project=project_id,
            region=region,
            resourcePolicy=name,
        ).execute()
    except gcp.http_error_exception() as http_err:
        # Only "not found" is treated as an absent policy; every other
        # HTTP failure propagates to the caller.
        if http_err.resp.status != 404:
            raise
    return None
@@ -41,6 +41,70 @@ HAS_TPU_PROVIDER_FIELD = '_has_tpus'
41
41
  # with ServiceAccounts.
42
42
 
43
43
  SKYPILOT_VPC_NAME = 'skypilot-vpc'
44
# Number of VPCs set up for a GPU Direct cluster. Index 0 is named as the
# management network ('-mgmt-net'); the remaining ones are data networks
# ('-data-net-<i>').
SKYPILOT_GPU_DIRECT_VPC_NUM = 5
# First two octets of the GPU Direct subnet ranges; the subnet of VPC i is
# '<prefix>.<i>.0/24'.
SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX = '10.129'
# Machine types accepted when GPU Direct is enabled; any other machineType
# is rejected with a ValueError during subnet configuration.
GPU_DIRECT_TCPX_INSTANCE_TYPES = [
    'a3-edgegpu-8g',
    'a3-highgpu-8g',
]
# The prefix length of the cluster name.
# To make sure the VPC and subnet names are within the GCP limits.
CLUSTER_PREFIX_LENGTH = 10

# Value that the lower-cased 'placement_policy' provider setting must equal
# for a group placement policy to be configured.
COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
# Value used for 'groupPlacementPolicy.collocation' in the created policy.
COLLOCATED_COLLOCATION = 'COLLOCATED'
56
+ GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
57
+ set -e
58
+ set -x
59
+ # Install GPU Direct TCPX
60
+ cos-extensions install gpu -- --version=latest;
61
+ sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
62
+ sudo mount -o remount,exec /var/lib/nvidia;
63
+ docker ps -a | grep -q receive-datapath-manager || \
64
+ docker run \
65
+ --detach \
66
+ --pull=always \
67
+ --name receive-datapath-manager \
68
+ --privileged \
69
+ --cap-add=NET_ADMIN --network=host \
70
+ --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \
71
+ --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 \
72
+ --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 \
73
+ --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 \
74
+ --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 \
75
+ --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl \
76
+ --env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
77
+ --volume /run/tcpx:/run/tcpx \
78
+ --entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \
79
+ us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd \
80
+ --gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0";
81
+ sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT;
82
+ docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl;
83
+ sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
84
+ sudo mount -o remount,exec /var/lib/tcpx;
85
+ echo "GPU Direct TCPX installed"
86
+ """
87
+
88
# Option strings in `docker run` flag syntax: capabilities, user namespace,
# volume mounts for /run/tcpx and the NVIDIA/TCPX library directories,
# shared-memory and ulimit settings, the eight GPU device nodes plus
# nvidia-uvm/nvidiactl, and LD_LIBRARY_PATH.
# NOTE(review): presumably appended to the run options of the user's
# container when GPU Direct TCPX is enabled -- confirm against the caller.
GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
    '--cap-add=IPC_LOCK',
    '--userns=host',
    '--volume /run/tcpx:/run/tcpx',
    '--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64',
    '--volume /var/lib/tcpx/lib64:/usr/local/tcpx/lib64',
    '--volume /var/lib/nvidia/bin:/usr/local/nvidia/bin',
    '--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864',
    '--device /dev/nvidia0:/dev/nvidia0',
    '--device /dev/nvidia1:/dev/nvidia1',
    '--device /dev/nvidia2:/dev/nvidia2',
    '--device /dev/nvidia3:/dev/nvidia3',
    '--device /dev/nvidia4:/dev/nvidia4',
    '--device /dev/nvidia5:/dev/nvidia5',
    '--device /dev/nvidia6:/dev/nvidia6',
    '--device /dev/nvidia7:/dev/nvidia7',
    '--device /dev/nvidia-uvm:/dev/nvidia-uvm',
    '--device /dev/nvidiactl:/dev/nvidiactl',
    '--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
]
44
108
 
45
109
  # Below parameters are from the default VPC on GCP.
46
110
  # https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
@@ -530,9 +530,11 @@ def terminate_instances(
530
530
  use_mig = provider_config.get('use_managed_instance_group', False)
531
531
  if use_mig:
532
532
  # Deleting the MIG will also delete the instances.
533
- instance_utils.GCPManagedInstanceGroup.delete_mig(
534
- project_id, zone, cluster_name_on_cloud)
535
- return
533
+ mig_exists_and_deleted = (
534
+ instance_utils.GCPManagedInstanceGroup.delete_mig(
535
+ project_id, zone, cluster_name_on_cloud))
536
+ if mig_exists_and_deleted:
537
+ return
536
538
 
537
539
  label_filters = {
538
540
  provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
@@ -1125,12 +1125,14 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1125
1125
  if re.search(mig_utils.IT_RESOURCE_NOT_FOUND_PATTERN,
1126
1126
  str(e)) is None:
1127
1127
  raise
1128
- logger.warning(
1128
+ logger.debug(
1129
1129
  f'Instance template {instance_template_name!r} does not exist. '
1130
1130
  'Skip deletion.')
1131
1131
 
1132
1132
  @classmethod
1133
- def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> None:
1133
+ def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> bool:
1134
+ """Returns whether the MIG is deleted successfully."""
1135
+ mig_exists_and_deleted = True
1134
1136
  mig_name = mig_utils.get_managed_instance_group_name(cluster_name)
1135
1137
  # Get all resize request of the MIG and cancel them.
1136
1138
  mig_utils.cancel_all_resize_request_for_mig(project_id, zone, mig_name)
@@ -1144,8 +1146,9 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1144
1146
  if re.search(mig_utils.MIG_RESOURCE_NOT_FOUND_PATTERN,
1145
1147
  str(e)) is None:
1146
1148
  raise
1147
- logger.warning(f'MIG {mig_name!r} does not exist. Skip '
1148
- 'deletion.')
1149
+ logger.debug(f'MIG {mig_name!r} does not exist. Skip '
1150
+ 'deletion.')
1151
+ mig_exists_and_deleted = False
1149
1152
 
1150
1153
  # In the autostop case, the following deletion of instance template
1151
1154
  # will not be executed as the instance that runs the deletion will be
@@ -1156,6 +1159,7 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1156
1159
  cls._delete_instance_template(
1157
1160
  project_id, zone,
1158
1161
  mig_utils.get_instance_template_name(cluster_name))
1162
+ return mig_exists_and_deleted
1159
1163
 
1160
1164
  @classmethod
1161
1165
  def _add_labels_and_find_head(
@@ -132,7 +132,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
132
132
  region=region,
133
133
  image_family=config.node_config['ImageId'],
134
134
  disk_size=config.node_config['DiskSize'],
135
- user_data=config.node_config['UserData'])
135
+ user_data=config.node_config['UserData'],
136
+ associate_public_ip_address=(
137
+ not config.provider_config['use_internal_ips']))
136
138
  except Exception as e: # pylint: disable=broad-except
137
139
  logger.warning(f'run_instances error: {e}')
138
140
  raise
@@ -158,7 +158,7 @@ def start(instance_id: str) -> None:
158
158
 
159
159
  def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
160
160
  preset: str, region: str, image_family: str, disk_size: int,
161
- user_data: str) -> str:
161
+ user_data: str, associate_public_ip_address: bool) -> str:
162
162
  # Each node must have a unique name to avoid conflicts between
163
163
  # multiple worker VMs. To ensure uniqueness,a UUID is appended
164
164
  # to the node name.
@@ -242,7 +242,9 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
242
242
  subnet_id=sub_net.items[0].metadata.id,
243
243
  ip_address=nebius.compute().IPAddress(),
244
244
  name='network-interface-0',
245
- public_ip_address=nebius.compute().PublicIPAddress())
245
+ public_ip_address=nebius.compute().PublicIPAddress()
246
+ if associate_public_ip_address else None,
247
+ )
246
248
  ]))).wait()
247
249
  instance_id = ''
248
250
  retry_count = 0