skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/cloud_vm_ray_backend.py +16 -4
  4. sky/check.py +109 -44
  5. sky/cli.py +261 -90
  6. sky/client/cli.py +261 -90
  7. sky/client/sdk.py +50 -2
  8. sky/clouds/__init__.py +3 -0
  9. sky/clouds/aws.py +4 -2
  10. sky/clouds/azure.py +4 -2
  11. sky/clouds/cloud.py +24 -6
  12. sky/clouds/cudo.py +2 -1
  13. sky/clouds/do.py +2 -1
  14. sky/clouds/fluidstack.py +2 -1
  15. sky/clouds/gcp.py +4 -2
  16. sky/clouds/ibm.py +4 -2
  17. sky/clouds/kubernetes.py +66 -22
  18. sky/clouds/lambda_cloud.py +2 -1
  19. sky/clouds/nebius.py +18 -2
  20. sky/clouds/oci.py +4 -2
  21. sky/clouds/paperspace.py +2 -1
  22. sky/clouds/runpod.py +2 -1
  23. sky/clouds/scp.py +2 -1
  24. sky/clouds/service_catalog/constants.py +1 -1
  25. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  26. sky/clouds/ssh.py +203 -0
  27. sky/clouds/vast.py +2 -1
  28. sky/clouds/vsphere.py +2 -1
  29. sky/core.py +53 -9
  30. sky/dashboard/out/404.html +1 -1
  31. sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
  33. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  34. sky/dashboard/out/clusters/[cluster].html +1 -1
  35. sky/dashboard/out/clusters.html +1 -1
  36. sky/dashboard/out/index.html +1 -1
  37. sky/dashboard/out/infra.html +1 -1
  38. sky/dashboard/out/jobs/[job].html +1 -1
  39. sky/dashboard/out/jobs.html +1 -1
  40. sky/optimizer.py +23 -4
  41. sky/provision/__init__.py +1 -0
  42. sky/provision/aws/instance.py +17 -1
  43. sky/provision/kubernetes/instance.py +16 -5
  44. sky/provision/kubernetes/utils.py +37 -19
  45. sky/provision/nebius/instance.py +3 -1
  46. sky/provision/nebius/utils.py +14 -2
  47. sky/provision/ssh/__init__.py +18 -0
  48. sky/resources.py +4 -1
  49. sky/server/requests/payloads.py +7 -0
  50. sky/server/server.py +40 -0
  51. sky/setup_files/dependencies.py +1 -0
  52. sky/templates/nebius-ray.yml.j2 +12 -0
  53. sky/utils/infra_utils.py +21 -1
  54. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  55. sky/utils/kubernetes/create_cluster.sh +1 -0
  56. sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
  57. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  58. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  59. sky/utils/log_utils.py +214 -1
  60. sky/utils/schemas.py +21 -0
  61. sky/utils/ux_utils.py +2 -1
  62. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
  63. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +68 -63
  64. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  65. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  66. /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
  67. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
  68. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
  69. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
  70. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
@@ -1 +0,0 @@
1
- (self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[588],{6206:function(e,t,r){(window.__NEXT_P=window.__NEXT_P||[]).push(["/infra",function(){return r(1090)}])},3266:function(e,t,r){"use strict";r.d(t,{QL:function(){return d},Sl:function(){return c},zd:function(){return o}});var s=r(7294),a=r(5821),n=r(3225);let l={UP:"RUNNING",STOPPED:"STOPPED",INIT:"LAUNCHING",null:"TERMINATED"};async function o(){let{clusterNames:e=null}=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{};try{let t=await fetch("".concat(n.f4,"/status"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({cluster_names:e,all_users:!0})}),r=t.headers.get("X-Skypilot-Request-ID")||t.headers.get("X-Request-ID"),s=await fetch("".concat(n.f4,"/api/get?request_id=").concat(r)),a=await s.json();return(a.return_value?JSON.parse(a.return_value):[]).map(e=>{let t="",r=t=e.zone?e.zone:e.region;return t&&t.length>25&&(t=function(e){let t=arguments.length>1&&void 0!==arguments[1]?arguments[1]:15;if(!e||e.length<=t)return e;if(t<=3)return"...";let r=Math.floor((t-3)/2),s=r+(t-3)%2;return 0===r?e.substring(0,s)+"...":e.substring(0,s)+"..."+e.substring(e.length-r)}(t,25)),{status:l[e.status],cluster:e.name,user:e.user_name,cloud:e.cloud,infra:t?e.cloud+" ("+t+")":e.cloud,full_infra:r?"".concat(e.cloud," (").concat(r,")"):e.cloud,cpus:e.cpus,mem:e.memory,gpus:e.accelerators,resources_str:e.resources_str,resources_str_full:e.resources_str_full,time:new Date(1e3*e.launched_at),num_nodes:e.nodes,jobs:[],events:[{time:new Date(1e3*e.launched_at),event:"Cluster created."}]}})}catch(e){return console.error("Error fetching clusters:",e),[]}}async function c(e){let{clusterName:t,jobId:r,onNewLog:s}=e;try{let e=(await fetch("".concat(n.f4,"/logs"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({follow:!1,cluster_name:t,job_id:r})})).body.getReader();for(;;){let{done:t,value:r}=await e.read();if(t)break;let a=new 
TextDecoder().decode(r);s(a)}}catch(e){console.error("Error in streamClusterJobLogs:",e),(0,a.C)("Error in streamClusterJobLogs: ".concat(e.message),"error")}}async function i(e){let{clusterName:t}=e;try{let e=await fetch("".concat(n.f4,"/queue"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({cluster_name:t,all_users:!0})}),r=e.headers.get("X-Skypilot-Request-ID")||e.headers.get("X-Request-ID"),s=await fetch("".concat(n.f4,"/api/get?request_id=").concat(r)),a=await s.json();return JSON.parse(a.return_value).map(e=>{let r=e.end_at?e.end_at:Date.now()/1e3,s=0,a=0;return e.submitted_at&&(s=r-e.submitted_at),e.start_at&&(a=r-e.start_at),{id:e.job_id,status:e.status,job:e.job_name,user:e.username,gpus:e.accelerators||{},submitted_at:e.submitted_at?new Date(1e3*e.submitted_at):null,resources:e.resources,cluster:t,total_duration:s,job_duration:a,infra:"",logs:""}})}catch(e){return console.error("Error fetching cluster jobs:",e),[]}}function d(e){let{cluster:t,job:r=null}=e,[a,n]=(0,s.useState)(null),[l,c]=(0,s.useState)(null),[d,u]=(0,s.useState)(!0),[m,h]=(0,s.useState)(!0),f=(0,s.useCallback)(async()=>{if(t)try{u(!0);let e=await o({clusterNames:[t]});n(e[0])}catch(e){console.error("Error fetching cluster data:",e)}finally{u(!1)}},[t]),x=(0,s.useCallback)(async()=>{if(t)try{h(!0);let e=await i({clusterName:t});c(e)}catch(e){console.error("Error fetching cluster job data:",e)}finally{h(!1)}},[t]),g=(0,s.useCallback)(async()=>{await Promise.all([f(),x()])},[f,x]);return(0,s.useEffect)(()=>{f(),x()},[t,r,f,x]),{clusterData:a,clusterJobData:l,loading:d||m,refreshData:g}}},8969:function(e,t,r){"use strict";r.d(t,{Ce:function(){return i},NJ:function(){return c},Pr:function(){return o},Vp:function(){return l}});var s=r(7294),a=r(5821),n=r(3225);async function l(){let{allUsers:e=!0}=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{};try{let t=(await 
fetch("".concat(n.f4,"/jobs/queue"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({all_users:e})})).headers.get("X-Skypilot-Request-ID"),r=await fetch("".concat(n.f4,"/api/get?request_id=").concat(t));if(500===r.status){try{let e=await r.json();if(e.detail&&e.detail.error)try{let t=JSON.parse(e.detail.error);if(t.type&&t.type===n.iW)return{jobs:[],controllerStopped:!0}}catch(e){console.error("Error parsing JSON:",e)}}catch(e){console.error("Error parsing JSON:",e)}return{jobs:[],controllerStopped:!1}}let s=await r.json();return{jobs:(s.return_value?JSON.parse(s.return_value):[]).map(e=>{let t=[];e.submitted_at&&t.push({time:new Date(1e3*e.submitted_at),event:"Job submitted."}),e.start_at&&t.push({time:new Date(1e3*e.start_at),event:"Job started."}),e.end_at&&("CANCELLING"==e.status||"CANCELLED"==e.status?t.push({time:new Date(1e3*e.end_at),event:"Job cancelled."}):t.push({time:new Date(1e3*e.end_at),event:"Job completed."})),e.last_recovered_at&&e.last_recovered_at!=e.start_at&&t.push({time:new Date(1e3*e.last_recovered_at),event:"Job recovered."});let r=(e.end_at?e.end_at:Date.now()/1e3)-e.submitted_at,s=e.cloud,a=e.cluster_resources;if(!s){if(e.cluster_resources&&"-"!==e.cluster_resources)try{s=e.cluster_resources.split("(")[0].split("x").pop().trim(),a=e.cluster_resources.replace("".concat(s,"("),"(").replace("x ","x")}catch(e){s="Unknown"}else s="Unknown"}let n="",l=n=e.zone?e.zone:e.region;n&&n.length>15&&(n=n.substring(0,15)+"...");let o=s+" ("+n+")";"-"===n&&(o=s);let c=s+" ("+l+")";return"-"===l&&(c=s),{id:e.job_id,task:e.task_name,name:e.job_name,job_duration:e.job_duration,total_duration:r,status:e.status,requested_resources:e.resources,resources_str:a,resources_str_full:e.cluster_resources_full||a,cloud:s,infra:o,full_infra:c,recoveries:e.recovery_count,details:e.failure_reason,user:e.user_name,submitted_at:e.submitted_at?new Date(1e3*e.submitted_at):null,events:t}}),controllerStopped:!1}}catch(e){return 
console.error("Error fetching managed job data:",e),{jobs:[],controllerStopped:!1}}}function o(){let e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:0,[t,r]=(0,s.useState)(null),[a,n]=(0,s.useState)(!0);return(0,s.useEffect)(()=>{(async function(){try{n(!0);let e=await l({allUsers:!0});r(e)}catch(e){console.error("Error fetching managed job data:",e)}finally{n(!1)}})()},[e]),{jobData:t,loading:a}}async function c(e){let{jobId:t,controller:r=!1,signal:s,onNewLog:l}=e,o=new Promise(e=>{setTimeout(()=>{e({timeout:!0})},1e4)}),c=(async()=>{try{let e=(await fetch("".concat(n.f4,"/jobs/logs"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({controller:r,follow:!1,job_id:t}),...s?{signal:s}:{}})).body.getReader();try{for(;;){let{done:t,value:r}=await e.read();if(t)break;let s=new TextDecoder().decode(r);l(s)}}finally{e.cancel()}return{timeout:!1}}catch(e){if("AbortError"===e.name)return{timeout:!1};throw e}})();if((await Promise.race([c,o])).timeout){(0,a.C)("Log request for job ".concat(t," timed out after ").concat(1e4,"ms"),"error");return}}async function i(e,t,r){let s="",l="",o="",c={};if("restartcontroller"===e)s="Restarting",l="restarted",o="jobs/queue",c={all_users:!0,refresh:!0},t="controller";else throw Error("Invalid action: ".concat(e));(0,a.C)("".concat(s," job ").concat(t,"..."),"info");try{try{let e=(await fetch("".concat(n.f4,"/").concat(o),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(c)})).headers.get("X-Skypilot-Request-ID"),i=await fetch("".concat(n.f4,"/api/get?request_id=").concat(e));if(200===i.status)(0,a.C)("Job ".concat(t," ").concat(l," successfully."),"success");else if(500===i.status)try{let e=await i.json();if(e.detail&&e.detail.error)try{let l=JSON.parse(e.detail.error);l.type&&l.type===n.Bo?(0,a.C)("".concat(s," job ").concat(t," is not supported!"),"error",1e4):l.type&&l.type===n.mF?(0,a.C)("Cluster ".concat(r," does not 
exist."),"error"):l.type&&l.type===n.iW?(0,a.C)("Cluster ".concat(r," is not up."),"error"):(0,a.C)("".concat(s," job ").concat(t," failed: ").concat(l.type),"error")}catch(r){(0,a.C)("".concat(s," job ").concat(t," failed: ").concat(e.detail.error),"error")}else(0,a.C)("".concat(s," job ").concat(t," failed with no details."),"error")}catch(e){(0,a.C)("".concat(s," job ").concat(t," failed with parse error."),"error")}else(0,a.C)("".concat(s," job ").concat(t," failed with status ").concat(i.status,"."),"error")}catch(e){console.error("Fetch error:",e),(0,a.C)("Network error ".concat(s," job ").concat(t,": ").concat(e.message),"error")}}catch(e){console.error("Error in handleStop:",e),(0,a.C)("Critical error ".concat(s," job ").concat(t,": ").concat(e.message),"error")}}},1090:function(e,t,r){"use strict";r.r(t),r.d(t,{default:function(){return v}});var s=r(5893),a=r(7294),n=r(8799),l=r(9470),o=r(3626),c=r(3001),i=r(3225),d=r(3266),u=r(8969);async function m(){try{let[e,t]=await Promise.all([(0,d.zd)(),(0,u.Vp)()]),r=(null==t?void 0:t.jobs)||[],s=[];try{let e=await fetch("".concat(i.f4,"/enabled_clouds"),{method:"GET",headers:{"Content-Type":"application/json"}}),t=e.headers.get("X-Skypilot-Request-ID")||e.headers.get("X-Request-ID"),r=await fetch("".concat(i.f4,"/api/get?request_id=").concat(t)),a=await r.json();s=a.return_value?JSON.parse(a.return_value):[],console.log("Enabled clouds:",s)}catch(e){console.error("Error fetching enabled clouds:",e),s=[]}let a={};i.$m.forEach(e=>{let t=s.includes(e);a[e]={name:e,clusters:0,jobs:0,enabled:t}}),(e||[]).forEach(e=>{if(e.cloud){let t=e.cloud;a[t]&&(a[t].clusters+=1,a[t].enabled=!0)}}),r.forEach(e=>{if(e.cloud){let t=e.cloud;a[t]&&(a[t].jobs+=1,a[t].enabled=!0)}});let n=i.$m.length,l=Object.values(a).filter(e=>e.enabled).length;return{clouds:Object.values(a).filter(e=>e.enabled).sort((e,t)=>t.clusters-e.clusters||t.jobs-e.jobs),totalClouds:n,enabledClouds:l}}catch(e){return console.error("Error fetching cloud 
infrastructure:",e),{clouds:[],totalClouds:i.$m.length,enabledClouds:0}}}async function h(){return await g()}async function f(){try{let e=await fetch("".concat(i.f4,"/realtime_kubernetes_gpu_availability"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({context:null,name_filter:null,quantity_filter:null})});if(!e.ok){if(422===e.status)return console.log("No GPU resources available in Kubernetes contexts"),[];return console.error("Error fetching Kubernetes context GPUs: ".concat(e.status," ").concat(e.statusText)),[]}let t=e.headers.get("X-Skypilot-Request-ID")||e.headers.get("x-request-id");if(!t)return console.error("No request ID returned for Kubernetes GPU availability"),[];let r=await fetch("".concat(i.f4,"/api/get?request_id=").concat(t));if(500===r.status){try{let e=await r.json();if(e.detail&&e.detail.error)try{let t=JSON.parse(e.detail.error);console.error("Error fetching Kubernetes context GPUs:",t.message)}catch(e){console.error("Error parsing JSON:",e)}}catch(e){console.error("Error parsing JSON:",e)}return[]}let s=await r.json();return s.return_value?JSON.parse(s.return_value):[]}catch(e){return console.error("Error fetching Kubernetes context GPUs:",e),[]}}async function x(e){try{let t=await fetch("".concat(i.f4,"/kubernetes_node_info"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({context:e})}),r=t.headers.get("X-Skypilot-Request-ID")||t.headers.get("x-request-id"),s=await fetch("".concat(i.f4,"/api/get?request_id=").concat(r));if(500===s.status){try{let e=await s.json();if(e.detail&&e.detail.error)try{let t=JSON.parse(e.detail.error);console.error("Error fetching Kubernetes per node GPUs:",t.message)}catch(e){console.error("Error parsing JSON:",e)}}catch(e){console.error("Error parsing JSON:",e)}return{}}let a=await s.json();return(a.return_value?JSON.parse(a.return_value):{}).node_info_dict||{}}catch(e){return console.error("Error fetching Kubernetes per node GPUs:",e),{}}}async 
function g(){try{let e=await f();if(!e||0===e.length)return console.log("No Kubernetes GPUs available"),{allGPUs:[],perContextGPUs:[],perNodeGPUs:[]};let t={},r={},s={};for(let a of e){let e=a[0],n=a[1];for(let s of(r[e]||(r[e]=[]),n)){let a=s[0],n=s[1].join(", "),l=s[2],o=s[3];a in t?(t[a].gpu_total+=l,t[a].gpu_free+=o):t[a]={gpu_total:l,gpu_free:o,gpu_name:a},r[e].push({gpu_name:a,gpu_requestable_qty_per_node:n,gpu_total:l,gpu_free:o,context:e})}let l=await x(e);for(let t in l)s["".concat(e,"/").concat(t)]={node_name:l[t].name,gpu_name:l[t].accelerator_type||"-",gpu_total:l[t].total.accelerator_count,gpu_free:l[t].free.accelerators_available,context:e}}return{allGPUs:Object.values(t).sort((e,t)=>e.gpu_name.localeCompare(t.gpu_name)),perContextGPUs:Object.values(r).flat().sort((e,t)=>e.context.localeCompare(t.context)||e.gpu_name.localeCompare(t.gpu_name)),perNodeGPUs:Object.values(s).sort((e,t)=>e.context.localeCompare(t.context)||e.node_name.localeCompare(t.node_name)||e.gpu_name.localeCompare(t.gpu_name))}}catch(e){return console.error("Error fetching Kubernetes GPUs:",e),{allGPUs:[],perContextGPUs:[],perNodeGPUs:[]}}}var p=r(1163),b=r(1664),j=r.n(b),y=r(7469);function _(){let[e,t]=(0,a.useState)(!0),[r,i]=(0,a.useState)(!0),[d,u]=(0,a.useState)(!0),f=a.useRef(null),x=(0,c.X)(),[g,b]=(0,a.useState)(!1),[_,N]=(0,a.useState)(!1),w=(0,p.useRouter)(),[v,C]=(0,a.useState)([]),[S,E]=(0,a.useState)([]),[P,O]=(0,a.useState)([]),[q,k]=(0,a.useState)([]),[U,J]=(0,a.useState)(0),[D,G]=(0,a.useState)(0),[T,I]=(0,a.useState)(null),R=a.useCallback(async()=>{t(!0),i(!0);try{let{allGPUs:e,perContextGPUs:t,perNodeGPUs:r}=await h();C(e||[]),E(t||[]),O(r||[]),b(!0)}catch(e){console.error("Error fetching Kubernetes data:",e),C([]),E([]),O([])}finally{t(!1)}try{let e=await m();k((null==e?void 0:e.clouds)||[]),J((null==e?void 0:e.totalClouds)||0),G((null==e?void 0:e.enabledClouds)||0),N(!0)}catch(e){console.error("Error fetching cloud infrastructure 
data:",e),k([]),J(0),G(0)}finally{i(!1),d&&u(!1)}},[d]);a.useEffect(()=>{f&&(f.current=R)},[f,R]),(0,a.useEffect)(()=>{let e=!0;R();let t=setInterval(()=>{e&&R()},6e4);return()=>{e=!1,clearInterval(t)}},[R]),v.length;let K=v.reduce((e,t)=>e+t.gpu_total,0),L=v.reduce((e,t)=>e+t.gpu_free,0),X=a.useMemo(()=>S?S.reduce((e,t)=>{let{context:r}=t;return e[r]||(e[r]=[]),e[r].push(t),e},{}):{},[S]),z=a.useMemo(()=>Object.keys(X),[X]),A=a.useMemo(()=>P?P.reduce((e,t)=>{let{context:r}=t;return e[r]||(e[r]=[]),e[r].push(t),e},{}):{},[P]);(0,a.useEffect)(()=>{w.query.context&&I(decodeURIComponent(Array.isArray(w.query.context)?w.query.context[0]:w.query.context))},[w.isReady,w.query]);let M=e=>{I(e),w.replace({pathname:"/infra",query:e?{context:e}:void 0},e?"/infra/".concat(encodeURIComponent(e)):"/infra",{shallow:!0})},F=()=>{I(null),w.replace({pathname:"/infra"},"/infra",{shallow:!0})};(0,a.useEffect)(()=>{let e=e=>{let t=e.match(/\/infra\/([^\/]+)$/);t?I(decodeURIComponent(t[1])):"/infra"===e&&I(null)};return w.events.on("routeChangeComplete",e),()=>{w.events.off("routeChangeComplete",e)}},[w.events]);let Z=e||r;return(0,s.jsxs)(l.A,{highlighted:"infra",children:[(0,s.jsxs)("div",{className:"flex items-center justify-between mb-4 h-5",children:[(0,s.jsxs)("div",{className:"text-base flex items-center",children:[(0,s.jsx)(j(),{href:"/infra",className:"text-sky-blue hover:underline ".concat(T?"":"cursor-default"),onClick:e=>{T&&(e.preventDefault(),F())},children:"Infrastructure"}),T&&(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)("span",{className:"mx-2 text-gray-500",children:"›"}),(0,s.jsx)("span",{className:"text-sky-blue hover:underline cursor-pointer",onClick:e=>{e.preventDefault(),F()},children:"Kubernetes"}),(0,s.jsx)("span",{className:"mx-2 text-gray-500",children:"›"}),(0,s.jsx)("span",{className:"text-sky-blue",children:T})]})]}),(0,s.jsxs)("div",{className:"flex items-center",children:[Z&&(0,s.jsxs)("div",{className:"flex items-center 
mr-2",children:[(0,s.jsx)(n.Z,{size:15,className:"mt-0"}),(0,s.jsx)("span",{className:"ml-2 text-gray-500",children:"Loading..."})]}),(0,s.jsxs)("button",{onClick:()=>{f.current&&(u(!1),f.current())},disabled:Z,className:"text-sky-blue hover:text-sky-blue-bright flex items-center",children:[(0,s.jsx)(o.Z,{className:"h-4 w-4 mr-1.5"}),!x&&"Refresh"]})]})]}),T?e&&!g?(0,s.jsxs)("div",{className:"flex flex-col items-center justify-center h-64",children:[(0,s.jsx)(n.Z,{size:32,className:"mb-4"}),(0,s.jsx)("span",{className:"text-gray-500 text-lg",children:"Loading Context..."})]}):(e=>{let t=X[e]||[],r=A[e]||[];return(0,s.jsx)("div",{className:"mb-4",children:(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm h-full",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsx)("h4",{className:"text-lg font-semibold mb-4",children:"Available GPUs"}),(0,s.jsx)("div",{className:"grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4 mb-6",children:t.map(e=>{let t=e.gpu_total-e.gpu_free,r=e.gpu_total>0?e.gpu_free/e.gpu_total*100:0,a=e.gpu_total>0?t/e.gpu_total*100:0;return(0,s.jsxs)("div",{className:"p-3 bg-gray-50 rounded-md border border-gray-200 shadow-sm",children:[(0,s.jsxs)("div",{className:"flex justify-between items-center mb-1.5 flex-wrap",children:[(0,s.jsxs)("div",{className:"font-medium text-gray-800 text-sm",children:[e.gpu_name,(0,s.jsxs)("span",{className:"text-xs text-gray-500 ml-2",children:["(Requestable: ",e.gpu_requestable_qty_per_node," / node)"]})]}),(0,s.jsxs)("span",{className:"text-xs font-medium",children:[e.gpu_free," free / ",e.gpu_total," total"]})]}),(0,s.jsxs)("div",{className:"w-full bg-gray-100 rounded-md h-4 flex overflow-hidden shadow-sm",children:[a>0&&(0,s.jsx)("div",{style:{width:"".concat(a,"%")},className:"bg-yellow-500 h-full flex items-center justify-center text-white text-xs",children:a>15&&"".concat(t," used")}),r>0&&(0,s.jsx)("div",{style:{width:"".concat(r,"%")},className:"bg-green-700 h-full 
flex items-center justify-center text-white text-xs",children:r>15&&"".concat(e.gpu_free," free")})]})]},e.gpu_name)})}),r.length>0&&(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)("h4",{className:"text-lg font-semibold mb-4",children:"Nodes"}),(0,s.jsx)("div",{className:"overflow-x-auto rounded-md border border-gray-200 shadow-sm",children:(0,s.jsxs)("table",{className:"min-w-full text-sm",children:[(0,s.jsx)("thead",{className:"bg-gray-100",children:(0,s.jsxs)("tr",{children:[(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600",children:"Node"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600",children:"GPU"}),(0,s.jsx)("th",{className:"p-3 text-right font-medium text-gray-600",children:"Availability"})]})}),(0,s.jsx)("tbody",{className:"bg-white divide-y divide-gray-200",children:r.map((e,t)=>(0,s.jsxs)("tr",{className:"hover:bg-gray-50",children:[(0,s.jsx)("td",{className:"p-3 whitespace-nowrap text-gray-700",children:e.node_name}),(0,s.jsx)("td",{className:"p-3 whitespace-nowrap text-gray-700",children:e.gpu_name}),(0,s.jsx)("td",{className:"p-3 whitespace-nowrap text-right text-gray-700",children:"".concat(e.gpu_free," of ").concat(e.gpu_total," free")})]},"".concat(e.node_name,"-").concat(t)))})]})})]})]})})})})(T):(0,s.jsxs)(s.Fragment,{children:[e&&!g?(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold mb-4",children:"Kubernetes"}),(0,s.jsxs)("div",{className:"flex items-center justify-center py-6",children:[(0,s.jsx)(n.Z,{size:24,className:"mr-3"}),(0,s.jsx)("span",{className:"text-gray-500",children:"Loading Kubernetes..."})]})]})}):g&&v.length>0?(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsxs)("div",{className:"flex items-center mb-4",children:[(0,s.jsx)("h3",{className:"text-lg 
font-semibold",children:"Kubernetes"}),(0,s.jsxs)("span",{className:"ml-2 px-2 py-0.5 bg-blue-100 text-blue-800 rounded-full text-xs font-medium",children:[z.length," ",1===z.length?"context":"contexts"]})]}),(0,s.jsxs)("div",{className:"grid grid-cols-1 md:grid-cols-2 gap-6",children:[(0,s.jsx)("div",{children:(0,s.jsx)("div",{className:"overflow-x-auto rounded-md border border-gray-200 shadow-sm bg-white",children:(0,s.jsxs)("table",{className:"min-w-full text-sm",children:[(0,s.jsx)("thead",{className:"bg-gray-50",children:(0,s.jsxs)("tr",{children:[(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/3",children:"Context"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/6",children:"Nodes"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/3",children:"GPU Types"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/6",children:"#GPUs"})]})}),(0,s.jsx)("tbody",{className:"bg-white divide-y divide-gray-200 ".concat(z.length>5?"max-h-[250px] overflow-y-auto block":""),children:z.map(e=>{let t=X[e]||[],r=A[e]||[],a=t.reduce((e,t)=>e+(t.gpu_total||0),0),n=Object.keys(t.reduce((e,t)=>(e[t.gpu_name]=(e[t.gpu_name]||0)+(t.gpu_total||0),e),{})).join(", ");return(0,s.jsxs)("tr",{className:"hover:bg-gray-50",children:[(0,s.jsx)("td",{className:"p-3",children:(0,s.jsx)(y.Md,{content:e,className:"text-sm text-muted-foreground",children:(0,s.jsx)("span",{className:"text-blue-600 hover:underline cursor-pointer",onClick:()=>M(e),children:e.length>30?"".concat(e.substring(0,Math.floor(13.5)),"...").concat(e.substring(e.length-Math.ceil(13.5))):e})})}),(0,s.jsx)("td",{className:"p-3",children:r.length}),(0,s.jsx)("td",{className:"p-3",children:n}),(0,s.jsx)("td",{className:"p-3",children:a})]},e)})})]})})}),(0,s.jsx)("div",{children:(0,s.jsx)("div",{className:"overflow-x-auto rounded-md border border-gray-200 shadow-sm bg-white",children:(0,s.jsxs)("table",{className:"min-w-full 
text-sm",children:[(0,s.jsx)("thead",{className:"bg-gray-50",children:(0,s.jsxs)("tr",{children:[(0,s.jsxs)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/4 whitespace-nowrap",children:["GPU",(0,s.jsxs)("span",{className:"ml-2 px-2 py-0.5 bg-green-100 text-green-800 rounded-full text-xs font-medium whitespace-nowrap",children:[L," of ",K," free"]})]}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/4",children:"Requestable"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/2",children:(0,s.jsx)("div",{className:"flex items-center",children:(0,s.jsx)("span",{children:"Utilization"})})})]})}),(0,s.jsx)("tbody",{className:"bg-white divide-y divide-gray-200 ".concat(v.length>5?"max-h-[250px] overflow-y-auto block":""),children:v.map(e=>{let t=e.gpu_total-e.gpu_free,r=e.gpu_total>0?e.gpu_free/e.gpu_total*100:0,a=e.gpu_total>0?t/e.gpu_total*100:0,n=S.filter(t=>t.gpu_name===e.gpu_name).map(e=>e.gpu_requestable_qty_per_node).filter((e,t,r)=>r.indexOf(e)===t).join(", ");return(0,s.jsxs)("tr",{children:[(0,s.jsx)("td",{className:"p-3 font-medium w-24 whitespace-nowrap",children:e.gpu_name}),(0,s.jsxs)("td",{className:"p-3 text-xs text-gray-600",children:[n||"-"," / node"]}),(0,s.jsx)("td",{className:"p-3 w-2/3",children:(0,s.jsx)("div",{className:"flex items-center gap-3",children:(0,s.jsxs)("div",{className:"flex-1 bg-gray-100 rounded-md h-5 flex overflow-hidden shadow-sm min-w-[100px] w-full",children:[a>0&&(0,s.jsx)("div",{style:{width:"".concat(a,"%")},className:"bg-yellow-500 h-full flex items-center justify-center text-white text-xs font-medium",children:a>15&&"".concat(t," used")}),r>0&&(0,s.jsx)("div",{style:{width:"".concat(r,"%")},className:"bg-green-700 h-full flex items-center justify-center text-white text-xs font-medium",children:r>15&&"".concat(e.gpu_free," free")})]})})})]},e.gpu_name)})})]})})})]})]})}):g&&0===v.length?(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground 
shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold mb-4",children:"Kubernetes Infrastructure"}),(0,s.jsx)("p",{className:"text-sm text-gray-500",children:"No Kubernetes GPUs found or Kubernetes is not configured."})]})}):null,r&&!_?(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold mb-4",children:"Cloud"}),(0,s.jsxs)("div",{className:"flex items-center justify-center py-6",children:[(0,s.jsx)(n.Z,{size:24,className:"mr-3"}),(0,s.jsx)("span",{className:"text-gray-500",children:"Loading Cloud..."})]})]})}):(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsxs)("div",{className:"flex items-center mb-4",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold",children:"Cloud"}),(0,s.jsxs)("span",{className:"ml-2 px-2 py-0.5 bg-blue-100 text-blue-800 rounded-full text-xs font-medium",children:[D," of ",U," enabled"]})]}),0===q.length?(0,s.jsx)("p",{className:"text-sm text-gray-500",children:"No enabled clouds available."}):(0,s.jsx)("div",{className:"overflow-x-auto rounded-md border border-gray-200 shadow-sm bg-white",children:(0,s.jsxs)("table",{className:"min-w-full text-sm",children:[(0,s.jsx)("thead",{className:"bg-gray-50",children:(0,s.jsxs)("tr",{children:[(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-32",children:"Cloud"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-24",children:"Clusters"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-24",children:"Jobs"})]})}),(0,s.jsx)("tbody",{className:"bg-white divide-y divide-gray-200",children:q.map(e=>(0,s.jsxs)("tr",{className:"hover:bg-gray-50",children:[(0,s.jsx)("td",{className:"p-3 font-medium 
text-gray-700",children:e.name}),(0,s.jsx)("td",{className:"p-3",children:e.clusters>0?(0,s.jsx)("span",{className:"px-2 py-0.5 bg-blue-100 text-blue-800 rounded text-xs font-medium",children:e.clusters}):(0,s.jsx)("span",{className:"px-2 py-0.5 bg-gray-100 text-gray-500 rounded text-xs font-medium",children:"0"})}),(0,s.jsx)("td",{className:"p-3",children:e.jobs>0?(0,s.jsx)("span",{className:"px-2 py-0.5 bg-green-100 text-green-800 rounded text-xs font-medium",children:e.jobs}):(0,s.jsx)("span",{className:"px-2 py-0.5 bg-gray-100 text-gray-500 rounded text-xs font-medium",children:"0"})})]},e.name))})]})})]})})]})]})}var N=r(9008),w=r.n(N);function v(){return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(w(),{children:(0,s.jsx)("title",{children:"Infra | SkyPilot Dashboard"})}),(0,s.jsx)(_,{})]})}}},function(e){e.O(0,[582,480,888,774,179],function(){return e(e.s=6206)}),_N_E=e.O()}]);
@@ -1,308 +0,0 @@
1
- #!/bin/bash
2
- # Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script.
3
- set -e
4
-
5
- # Colors for nicer UX
6
- RED='\033[0;31m'
7
- GREEN='\033[0;32m'
8
- YELLOW='\033[1;33m'
9
- NC='\033[0m' # No color
10
-
11
- # Variables
12
- CLEANUP=false
13
- INSTALL_GPU=false
14
- POSITIONAL_ARGS=()
15
- PASSWORD=""
16
-
17
- # Process all arguments
18
- while [[ $# -gt 0 ]]; do
19
- case $1 in
20
- --cleanup)
21
- CLEANUP=true
22
- shift
23
- ;;
24
- --password)
25
- PASSWORD=$2
26
- shift
27
- shift
28
- ;;
29
- *)
30
- POSITIONAL_ARGS+=("$1")
31
- shift
32
- ;;
33
- esac
34
- done
35
-
36
- # Restore positional arguments in correct order
37
- set -- "${POSITIONAL_ARGS[@]}"
38
-
39
- # Assign positional arguments to variables
40
- IPS_FILE=$1
41
- USER=$2
42
- SSH_KEY=$3
43
- CONTEXT_NAME=${4:-default}
44
- K3S_TOKEN=mytoken # Any string can be used as the token
45
- # Create temporary askpass script for sudo
46
- ASKPASS_BLOCK="# Create temporary askpass script
47
- ASKPASS_SCRIPT=\$(mktemp)
48
- trap 'rm -f \$ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
49
- cat > \$ASKPASS_SCRIPT << EOF
50
- #!/bin/bash
51
- echo $PASSWORD
52
- EOF
53
- chmod 700 \$ASKPASS_SCRIPT
54
- # Use askpass
55
- export SUDO_ASKPASS=\$ASKPASS_SCRIPT
56
- "
57
-
58
# Validate the required positional arguments before touching any machine
if [[ -z "$IPS_FILE" || -z "$USER" || -z "$SSH_KEY" ]]; then
    echo -e "${RED}Error: Missing required arguments.${NC}" >&2
    echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [context-name] [--cleanup] [--password password]" >&2
    exit 1
fi

# The SSH private key must be an existing regular file
if ! [[ -f "$SSH_KEY" ]]; then
    echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}" >&2
    exit 1
fi

# So must the file listing the node IPs
if ! [[ -f "$IPS_FILE" ]]; then
    echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}" >&2
    exit 1
fi

# First line of the IPs file is the head node; all remaining lines are workers
HEAD_NODE=$(head -n 1 "$IPS_FILE")
WORKER_NODES=$(tail -n +2 "$IPS_FILE")

# An empty first line means there is no head node to deploy to
if [[ -z "$HEAD_NODE" ]]; then
    echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}" >&2
    exit 1
fi
86
-
87
# Print an in-progress status line in yellow, prefixed with an arrow.
#   $1 - message text
progress_message() {
    local message=$1
    echo -e "${YELLOW}➜ ${message}${NC}"
}
91
-
92
# Print a success status line in green, prefixed with a check mark.
#   $1 - message text
success_message() {
    local message=$1
    echo -e "${GREEN}✔ ${message}${NC}"
}
96
-
97
# Execute a command string on a remote machine over SSH.
#   $1 - IP or hostname of the node
#   $2 - command string, interpreted by the remote shell
# Uses the script-level $USER and $SSH_KEY. Host-key checking is disabled and
# only the given identity file is offered.
run_remote() {
    local host=$1
    local remote_cmd=$2
    local ssh_opts=(-o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY")
    ssh "${ssh_opts[@]}" "$USER@$host" "$remote_cmd"
}
104
-
105
# Uninstall the k3s server from the head node and wipe its Kubernetes state.
#   $1 - IP or hostname of the head node
# The uninstall script is allowed to fail (e.g. k3s was never installed); the
# state directories are removed regardless.
cleanup_server_node() {
    local node=$1
    local uninstall_cmd="
        $ASKPASS_BLOCK
        echo 'Uninstalling k3s...' &&
        sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
        "
    echo -e "${YELLOW}Cleaning up head node ${node}...${NC}"
    run_remote "$node" "$uninstall_cmd"
    echo -e "${GREEN}Node ${node} cleaned up successfully.${NC}"
}
117
-
118
# Uninstall the k3s agent from a worker node and wipe its Kubernetes state.
#   $1 - IP or hostname of the worker node
# The uninstall script is allowed to fail (e.g. the agent was never
# installed); the state directories are removed regardless.
cleanup_agent_node() {
    local node=$1
    local uninstall_cmd="
        $ASKPASS_BLOCK
        echo 'Uninstalling k3s...' &&
        sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
        "
    echo -e "${YELLOW}Cleaning up node ${node}...${NC}"
    run_remote "$node" "$uninstall_cmd"
    echo -e "${GREEN}Node ${node} cleaned up successfully.${NC}"
}
130
-
131
# Probe a node for a usable NVIDIA driver.
#   $1 - IP or hostname of the node
# Returns 0 when nvidia-smi is on the remote PATH and can query the GPUs,
# and 1 in every other case (including an SSH failure).
check_gpu() {
    local node=$1
    local probe="command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"
    run_remote "$node" "$probe" || return 1
    return 0
}
139
-
140
# Pre-flight checks: with `set -e`, an unreachable head node aborts here.
run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
# TODO: Add more pre-flight checks here, including checking if port 6443 is accessible

# If --cleanup flag is set, uninstall k3s and exit
if [ "$CLEANUP" == "true" ]; then
    echo -e "${YELLOW}Starting cleanup...${NC}"

    # Clean up head node
    cleanup_server_node "$HEAD_NODE"

    # Clean up worker nodes ($WORKER_NODES is intentionally unquoted so it
    # splits into one word per node)
    for NODE in $WORKER_NODES; do
        cleanup_agent_node "$NODE"
    done

    # Remove the context from local kubeconfig if it exists
    if [ -f "$HOME/.kube/config" ]; then
        progress_message "Removing context '$CONTEXT_NAME' from local kubeconfig..."
        # Remember which context was active so an unrelated active context is
        # left alone after the deletion.
        PREVIOUS_CONTEXT=$(kubectl config current-context 2>/dev/null || true)
        kubectl config delete-context "$CONTEXT_NAME" 2>/dev/null || true
        kubectl config delete-cluster "$CONTEXT_NAME" 2>/dev/null || true
        kubectl config delete-user "$CONTEXT_NAME" 2>/dev/null || true
        # Only pick a new current context when none was set or the one just
        # removed was the active one; fall back to the first remaining
        # context, if there is any.
        if [ -z "$PREVIOUS_CONTEXT" ] || [ "$PREVIOUS_CONTEXT" == "$CONTEXT_NAME" ]; then
            FALLBACK_CONTEXT=$(kubectl config view -o jsonpath='{.contexts[0].name}' 2>/dev/null || true)
            if [ -n "$FALLBACK_CONTEXT" ]; then
                kubectl config use-context "$FALLBACK_CONTEXT" 2>/dev/null || true
            fi
        fi
        success_message "Context '$CONTEXT_NAME' removed from local kubeconfig."
    fi

    echo -e "${GREEN}Cleanup completed successfully.${NC}"
    exit 0
fi
170
-
171
# Step 1: Install k3s on the head node
progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
# The remote snippet is expanded locally first: $ASKPASS_BLOCK and $K3S_TOKEN
# are substituted here, while the \$-escaped expressions are evaluated by the
# remote shell.
#  - An install failure now exits non-zero instead of falling through to the
#    readiness check and being reported as success.
#  - Readiness is tracked with an explicit flag, so succeeding on the third
#    attempt is no longer treated as a failure.
#  - The loop uses a plain word list rather than brace expansion, so it also
#    works when the remote login shell is not bash.
run_remote "$HEAD_NODE" "
    $ASKPASS_BLOCK
    curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sudo -E -A sh - &&
    mkdir -p ~/.kube &&
    sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
    sudo -A chown \$(id -u):\$(id -g) ~/.kube/config ||
    { echo 'Failed to install k3s on the head node'; exit 1; }
    nodes_ready=false
    for attempt in 1 2 3; do
        if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
            nodes_ready=true
            break
        else
            echo 'Waiting for nodes to be ready...'
            sleep 5
        fi
    done
    if [ \"\$nodes_ready\" != true ]; then
        echo 'Failed to wait for nodes to be ready after 3 attempts'
        exit 1
    fi"
success_message "K3s deployed on head node."
192
-
193
# Note whether the head node exposes a GPU; the GPU operator is installed
# later if any node sets this flag.
if check_gpu "$HEAD_NODE"; then
    INSTALL_GPU=true
    echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
fi

# Ask the head node for the first address reported by `hostname -I`; workers
# use it to reach the k3s API server.
FIRST_IP_CMD="hostname -I | awk '{print \$1}'"
MASTER_ADDR=$(run_remote "$HEAD_NODE" "$FIRST_IP_CMD")

echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"
203
-
204
# Step 2: Install the k3s agent on every worker and join it to the head node.
# $WORKER_NODES is left unquoted on purpose so it splits into one word per node.
for NODE in $WORKER_NODES; do
    progress_message "Deploying Kubernetes on worker node ($NODE)..."
    # $MASTER_ADDR and $K3S_TOKEN are substituted locally before the command
    # is sent to the worker.
    JOIN_CMD="
        $ASKPASS_BLOCK
        curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sudo -E -A sh -"
    run_remote "$NODE" "$JOIN_CMD"
    success_message "Kubernetes deployed on worker node ($NODE)."

    # A GPU on any worker is enough to trigger the GPU operator install later
    if check_gpu "$NODE"; then
        INSTALL_GPU=true
        echo -e "${YELLOW}GPU detected on worker node ($NODE).${NC}"
    fi
done
218
# Step 3: Configure local kubectl to connect to the cluster
progress_message "Configuring local kubectl to connect to the cluster..."

# Create temporary directory for kubeconfig operations. It will hold cluster
# credentials, so make sure it is removed even when a later step fails and
# `set -e` aborts the script.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf "$TEMP_DIR"' EXIT
TEMP_KUBECONFIG="$TEMP_DIR/kubeconfig"

# Get the kubeconfig from remote server
scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config "$TEMP_KUBECONFIG"

# Create .kube directory if it doesn't exist
mkdir -p "$HOME/.kube"

# Create empty kubeconfig if it doesn't exist
KUBECONFIG_FILE="$HOME/.kube/config"
if [[ ! -f "$KUBECONFIG_FILE" ]]; then
    touch "$KUBECONFIG_FILE"
fi

# Modify the temporary kubeconfig to update server address and context name.
# The server URL is passed in with -v instead of being spliced into the awk
# program text, and the replacement lines reuse the indentation of the
# original `server:` line so the YAML nesting is preserved.
# NOTE(review): the CA data is dropped and TLS verification is disabled for
# this cluster entry; connections to the API server are not authenticated.
awk -v context="$CONTEXT_NAME" -v server="https://${HEAD_NODE}:6443" '
    /^clusters:/ { in_cluster = 1 }
    /^users:/ { in_cluster = 0 }
    in_cluster && /^ *certificate-authority-data:/ { next }
    in_cluster && /^ *server:/ {
        indent = $0
        sub(/server:.*/, "", indent)
        print indent "server: " server
        print indent "insecure-skip-tls-verify: true"
        next
    }
    /name: default/ { sub("name: default", "name: " context) }
    /cluster: default/ { sub("cluster: default", "cluster: " context) }
    /user: default/ { sub("user: default", "user: " context) }
    /current-context: default/ { sub("current-context: default", "current-context: " context) }
    { print }
' "$TEMP_KUBECONFIG" > "$TEMP_DIR/modified_config"

# Merge the configurations using kubectl
KUBECONFIG="$KUBECONFIG_FILE:$TEMP_DIR/modified_config" kubectl config view --flatten > "$TEMP_DIR/merged_config"
mv "$TEMP_DIR/merged_config" "$KUBECONFIG_FILE"
# The merged file was created with the default umask and replaces the previous
# kubeconfig; restrict it to the owner since it contains credentials.
chmod 600 "$KUBECONFIG_FILE"

# Set the new context as the current context
kubectl config use-context "$CONTEXT_NAME"

# Clean up temporary files
rm -rf "$TEMP_DIR"

success_message "kubectl configured with new context '$CONTEXT_NAME'."
265
-
266
echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."

# Install the NVIDIA GPU operator only when at least one node reported a GPU
if [ "$INSTALL_GPU" != "true" ]; then
    echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
else
    echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
    # Runs on the head node: install helm, add the NVIDIA chart repo, install
    # the operator pointed at k3s's containerd, then poll every 5 seconds until
    # a node advertises the nvidia.com/gpu resource. The poll has no timeout.
    GPU_OPERATOR_CMD="
        $ASKPASS_BLOCK
        curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
        chmod 700 get_helm.sh &&
        ./get_helm.sh &&
        helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
        kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
        sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
        helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
        --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
        --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
        --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
        --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
        --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
        --set 'toolkit.env[2].value=nvidia' &&
        echo 'Waiting for GPU operator installation...' &&
        while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
            echo 'Waiting for GPU operator...'
            sleep 5
        done
        echo 'GPU operator installed successfully.'"
    run_remote "$HEAD_NODE" "$GPU_OPERATOR_CMD"
    success_message "GPU Operator installed."
fi
296
-
297
# Have SkyPilot validate the Kubernetes context that was just configured
progress_message "Configuring SkyPilot..."
sky check kubernetes
success_message "SkyPilot configured successfully."

# Final banner followed by a short list of next-step commands
echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
printf '%s\n' \
    "You can now interact with your Kubernetes cluster through SkyPilot: " \
    " • List available GPUs: sky show-gpus --cloud kubernetes" \
    " • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1" \
    " • Connect to pod with SSH: ssh devbox" \
    " • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"