skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/cloud_vm_ray_backend.py +16 -4
- sky/check.py +109 -44
- sky/cli.py +261 -90
- sky/client/cli.py +261 -90
- sky/client/sdk.py +50 -2
- sky/clouds/__init__.py +3 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +24 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +4 -2
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +53 -9
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/optimizer.py +23 -4
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +4 -1
- sky/server/requests/payloads.py +7 -0
- sky/server/server.py +40 -0
- sky/setup_files/dependencies.py +1 -0
- sky/templates/nebius-ray.yml.j2 +12 -0
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +214 -1
- sky/utils/schemas.py +21 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +68 -63
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
@@ -1 +0,0 @@
|
|
1
|
-
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[588],{6206:function(e,t,r){(window.__NEXT_P=window.__NEXT_P||[]).push(["/infra",function(){return r(1090)}])},3266:function(e,t,r){"use strict";r.d(t,{QL:function(){return d},Sl:function(){return c},zd:function(){return o}});var s=r(7294),a=r(5821),n=r(3225);let l={UP:"RUNNING",STOPPED:"STOPPED",INIT:"LAUNCHING",null:"TERMINATED"};async function o(){let{clusterNames:e=null}=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{};try{let t=await fetch("".concat(n.f4,"/status"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({cluster_names:e,all_users:!0})}),r=t.headers.get("X-Skypilot-Request-ID")||t.headers.get("X-Request-ID"),s=await fetch("".concat(n.f4,"/api/get?request_id=").concat(r)),a=await s.json();return(a.return_value?JSON.parse(a.return_value):[]).map(e=>{let t="",r=t=e.zone?e.zone:e.region;return t&&t.length>25&&(t=function(e){let t=arguments.length>1&&void 0!==arguments[1]?arguments[1]:15;if(!e||e.length<=t)return e;if(t<=3)return"...";let r=Math.floor((t-3)/2),s=r+(t-3)%2;return 0===r?e.substring(0,s)+"...":e.substring(0,s)+"..."+e.substring(e.length-r)}(t,25)),{status:l[e.status],cluster:e.name,user:e.user_name,cloud:e.cloud,infra:t?e.cloud+" ("+t+")":e.cloud,full_infra:r?"".concat(e.cloud," (").concat(r,")"):e.cloud,cpus:e.cpus,mem:e.memory,gpus:e.accelerators,resources_str:e.resources_str,resources_str_full:e.resources_str_full,time:new Date(1e3*e.launched_at),num_nodes:e.nodes,jobs:[],events:[{time:new Date(1e3*e.launched_at),event:"Cluster created."}]}})}catch(e){return console.error("Error fetching clusters:",e),[]}}async function c(e){let{clusterName:t,jobId:r,onNewLog:s}=e;try{let e=(await fetch("".concat(n.f4,"/logs"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({follow:!1,cluster_name:t,job_id:r})})).body.getReader();for(;;){let{done:t,value:r}=await e.read();if(t)break;let a=new TextDecoder().decode(r);s(a)}}catch(e){console.error("Error in streamClusterJobLogs:",e),(0,a.C)("Error in streamClusterJobLogs: ".concat(e.message),"error")}}async function i(e){let{clusterName:t}=e;try{let e=await fetch("".concat(n.f4,"/queue"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({cluster_name:t,all_users:!0})}),r=e.headers.get("X-Skypilot-Request-ID")||e.headers.get("X-Request-ID"),s=await fetch("".concat(n.f4,"/api/get?request_id=").concat(r)),a=await s.json();return JSON.parse(a.return_value).map(e=>{let r=e.end_at?e.end_at:Date.now()/1e3,s=0,a=0;return e.submitted_at&&(s=r-e.submitted_at),e.start_at&&(a=r-e.start_at),{id:e.job_id,status:e.status,job:e.job_name,user:e.username,gpus:e.accelerators||{},submitted_at:e.submitted_at?new Date(1e3*e.submitted_at):null,resources:e.resources,cluster:t,total_duration:s,job_duration:a,infra:"",logs:""}})}catch(e){return console.error("Error fetching cluster jobs:",e),[]}}function d(e){let{cluster:t,job:r=null}=e,[a,n]=(0,s.useState)(null),[l,c]=(0,s.useState)(null),[d,u]=(0,s.useState)(!0),[m,h]=(0,s.useState)(!0),f=(0,s.useCallback)(async()=>{if(t)try{u(!0);let e=await o({clusterNames:[t]});n(e[0])}catch(e){console.error("Error fetching cluster data:",e)}finally{u(!1)}},[t]),x=(0,s.useCallback)(async()=>{if(t)try{h(!0);let e=await i({clusterName:t});c(e)}catch(e){console.error("Error fetching cluster job data:",e)}finally{h(!1)}},[t]),g=(0,s.useCallback)(async()=>{await Promise.all([f(),x()])},[f,x]);return(0,s.useEffect)(()=>{f(),x()},[t,r,f,x]),{clusterData:a,clusterJobData:l,loading:d||m,refreshData:g}}},8969:function(e,t,r){"use strict";r.d(t,{Ce:function(){return i},NJ:function(){return c},Pr:function(){return o},Vp:function(){return l}});var s=r(7294),a=r(5821),n=r(3225);async function l(){let{allUsers:e=!0}=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{};try{let t=(await fetch("".concat(n.f4,"/jobs/queue"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({all_users:e})})).headers.get("X-Skypilot-Request-ID"),r=await fetch("".concat(n.f4,"/api/get?request_id=").concat(t));if(500===r.status){try{let e=await r.json();if(e.detail&&e.detail.error)try{let t=JSON.parse(e.detail.error);if(t.type&&t.type===n.iW)return{jobs:[],controllerStopped:!0}}catch(e){console.error("Error parsing JSON:",e)}}catch(e){console.error("Error parsing JSON:",e)}return{jobs:[],controllerStopped:!1}}let s=await r.json();return{jobs:(s.return_value?JSON.parse(s.return_value):[]).map(e=>{let t=[];e.submitted_at&&t.push({time:new Date(1e3*e.submitted_at),event:"Job submitted."}),e.start_at&&t.push({time:new Date(1e3*e.start_at),event:"Job started."}),e.end_at&&("CANCELLING"==e.status||"CANCELLED"==e.status?t.push({time:new Date(1e3*e.end_at),event:"Job cancelled."}):t.push({time:new Date(1e3*e.end_at),event:"Job completed."})),e.last_recovered_at&&e.last_recovered_at!=e.start_at&&t.push({time:new Date(1e3*e.last_recovered_at),event:"Job recovered."});let r=(e.end_at?e.end_at:Date.now()/1e3)-e.submitted_at,s=e.cloud,a=e.cluster_resources;if(!s){if(e.cluster_resources&&"-"!==e.cluster_resources)try{s=e.cluster_resources.split("(")[0].split("x").pop().trim(),a=e.cluster_resources.replace("".concat(s,"("),"(").replace("x ","x")}catch(e){s="Unknown"}else s="Unknown"}let n="",l=n=e.zone?e.zone:e.region;n&&n.length>15&&(n=n.substring(0,15)+"...");let o=s+" ("+n+")";"-"===n&&(o=s);let c=s+" ("+l+")";return"-"===l&&(c=s),{id:e.job_id,task:e.task_name,name:e.job_name,job_duration:e.job_duration,total_duration:r,status:e.status,requested_resources:e.resources,resources_str:a,resources_str_full:e.cluster_resources_full||a,cloud:s,infra:o,full_infra:c,recoveries:e.recovery_count,details:e.failure_reason,user:e.user_name,submitted_at:e.submitted_at?new Date(1e3*e.submitted_at):null,events:t}}),controllerStopped:!1}}catch(e){return console.error("Error fetching managed job data:",e),{jobs:[],controllerStopped:!1}}}function o(){let e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:0,[t,r]=(0,s.useState)(null),[a,n]=(0,s.useState)(!0);return(0,s.useEffect)(()=>{(async function(){try{n(!0);let e=await l({allUsers:!0});r(e)}catch(e){console.error("Error fetching managed job data:",e)}finally{n(!1)}})()},[e]),{jobData:t,loading:a}}async function c(e){let{jobId:t,controller:r=!1,signal:s,onNewLog:l}=e,o=new Promise(e=>{setTimeout(()=>{e({timeout:!0})},1e4)}),c=(async()=>{try{let e=(await fetch("".concat(n.f4,"/jobs/logs"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({controller:r,follow:!1,job_id:t}),...s?{signal:s}:{}})).body.getReader();try{for(;;){let{done:t,value:r}=await e.read();if(t)break;let s=new TextDecoder().decode(r);l(s)}}finally{e.cancel()}return{timeout:!1}}catch(e){if("AbortError"===e.name)return{timeout:!1};throw e}})();if((await Promise.race([c,o])).timeout){(0,a.C)("Log request for job ".concat(t," timed out after ").concat(1e4,"ms"),"error");return}}async function i(e,t,r){let s="",l="",o="",c={};if("restartcontroller"===e)s="Restarting",l="restarted",o="jobs/queue",c={all_users:!0,refresh:!0},t="controller";else throw Error("Invalid action: ".concat(e));(0,a.C)("".concat(s," job ").concat(t,"..."),"info");try{try{let e=(await fetch("".concat(n.f4,"/").concat(o),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(c)})).headers.get("X-Skypilot-Request-ID"),i=await fetch("".concat(n.f4,"/api/get?request_id=").concat(e));if(200===i.status)(0,a.C)("Job ".concat(t," ").concat(l," successfully."),"success");else if(500===i.status)try{let e=await i.json();if(e.detail&&e.detail.error)try{let l=JSON.parse(e.detail.error);l.type&&l.type===n.Bo?(0,a.C)("".concat(s," job ").concat(t," is not supported!"),"error",1e4):l.type&&l.type===n.mF?(0,a.C)("Cluster ".concat(r," does not exist."),"error"):l.type&&l.type===n.iW?(0,a.C)("Cluster ".concat(r," is not up."),"error"):(0,a.C)("".concat(s," job ").concat(t," failed: ").concat(l.type),"error")}catch(r){(0,a.C)("".concat(s," job ").concat(t," failed: ").concat(e.detail.error),"error")}else(0,a.C)("".concat(s," job ").concat(t," failed with no details."),"error")}catch(e){(0,a.C)("".concat(s," job ").concat(t," failed with parse error."),"error")}else(0,a.C)("".concat(s," job ").concat(t," failed with status ").concat(i.status,"."),"error")}catch(e){console.error("Fetch error:",e),(0,a.C)("Network error ".concat(s," job ").concat(t,": ").concat(e.message),"error")}}catch(e){console.error("Error in handleStop:",e),(0,a.C)("Critical error ".concat(s," job ").concat(t,": ").concat(e.message),"error")}}},1090:function(e,t,r){"use strict";r.r(t),r.d(t,{default:function(){return v}});var s=r(5893),a=r(7294),n=r(8799),l=r(9470),o=r(3626),c=r(3001),i=r(3225),d=r(3266),u=r(8969);async function m(){try{let[e,t]=await Promise.all([(0,d.zd)(),(0,u.Vp)()]),r=(null==t?void 0:t.jobs)||[],s=[];try{let e=await fetch("".concat(i.f4,"/enabled_clouds"),{method:"GET",headers:{"Content-Type":"application/json"}}),t=e.headers.get("X-Skypilot-Request-ID")||e.headers.get("X-Request-ID"),r=await fetch("".concat(i.f4,"/api/get?request_id=").concat(t)),a=await r.json();s=a.return_value?JSON.parse(a.return_value):[],console.log("Enabled clouds:",s)}catch(e){console.error("Error fetching enabled clouds:",e),s=[]}let a={};i.$m.forEach(e=>{let t=s.includes(e);a[e]={name:e,clusters:0,jobs:0,enabled:t}}),(e||[]).forEach(e=>{if(e.cloud){let t=e.cloud;a[t]&&(a[t].clusters+=1,a[t].enabled=!0)}}),r.forEach(e=>{if(e.cloud){let t=e.cloud;a[t]&&(a[t].jobs+=1,a[t].enabled=!0)}});let n=i.$m.length,l=Object.values(a).filter(e=>e.enabled).length;return{clouds:Object.values(a).filter(e=>e.enabled).sort((e,t)=>t.clusters-e.clusters||t.jobs-e.jobs),totalClouds:n,enabledClouds:l}}catch(e){return console.error("Error fetching cloud infrastructure:",e),{clouds:[],totalClouds:i.$m.length,enabledClouds:0}}}async function h(){return await g()}async function f(){try{let e=await fetch("".concat(i.f4,"/realtime_kubernetes_gpu_availability"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({context:null,name_filter:null,quantity_filter:null})});if(!e.ok){if(422===e.status)return console.log("No GPU resources available in Kubernetes contexts"),[];return console.error("Error fetching Kubernetes context GPUs: ".concat(e.status," ").concat(e.statusText)),[]}let t=e.headers.get("X-Skypilot-Request-ID")||e.headers.get("x-request-id");if(!t)return console.error("No request ID returned for Kubernetes GPU availability"),[];let r=await fetch("".concat(i.f4,"/api/get?request_id=").concat(t));if(500===r.status){try{let e=await r.json();if(e.detail&&e.detail.error)try{let t=JSON.parse(e.detail.error);console.error("Error fetching Kubernetes context GPUs:",t.message)}catch(e){console.error("Error parsing JSON:",e)}}catch(e){console.error("Error parsing JSON:",e)}return[]}let s=await r.json();return s.return_value?JSON.parse(s.return_value):[]}catch(e){return console.error("Error fetching Kubernetes context GPUs:",e),[]}}async function x(e){try{let t=await fetch("".concat(i.f4,"/kubernetes_node_info"),{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({context:e})}),r=t.headers.get("X-Skypilot-Request-ID")||t.headers.get("x-request-id"),s=await fetch("".concat(i.f4,"/api/get?request_id=").concat(r));if(500===s.status){try{let e=await s.json();if(e.detail&&e.detail.error)try{let t=JSON.parse(e.detail.error);console.error("Error fetching Kubernetes per node GPUs:",t.message)}catch(e){console.error("Error parsing JSON:",e)}}catch(e){console.error("Error parsing JSON:",e)}return{}}let a=await s.json();return(a.return_value?JSON.parse(a.return_value):{}).node_info_dict||{}}catch(e){return console.error("Error fetching Kubernetes per node GPUs:",e),{}}}async function g(){try{let e=await f();if(!e||0===e.length)return console.log("No Kubernetes GPUs available"),{allGPUs:[],perContextGPUs:[],perNodeGPUs:[]};let t={},r={},s={};for(let a of e){let e=a[0],n=a[1];for(let s of(r[e]||(r[e]=[]),n)){let a=s[0],n=s[1].join(", "),l=s[2],o=s[3];a in t?(t[a].gpu_total+=l,t[a].gpu_free+=o):t[a]={gpu_total:l,gpu_free:o,gpu_name:a},r[e].push({gpu_name:a,gpu_requestable_qty_per_node:n,gpu_total:l,gpu_free:o,context:e})}let l=await x(e);for(let t in l)s["".concat(e,"/").concat(t)]={node_name:l[t].name,gpu_name:l[t].accelerator_type||"-",gpu_total:l[t].total.accelerator_count,gpu_free:l[t].free.accelerators_available,context:e}}return{allGPUs:Object.values(t).sort((e,t)=>e.gpu_name.localeCompare(t.gpu_name)),perContextGPUs:Object.values(r).flat().sort((e,t)=>e.context.localeCompare(t.context)||e.gpu_name.localeCompare(t.gpu_name)),perNodeGPUs:Object.values(s).sort((e,t)=>e.context.localeCompare(t.context)||e.node_name.localeCompare(t.node_name)||e.gpu_name.localeCompare(t.gpu_name))}}catch(e){return console.error("Error fetching Kubernetes GPUs:",e),{allGPUs:[],perContextGPUs:[],perNodeGPUs:[]}}}var p=r(1163),b=r(1664),j=r.n(b),y=r(7469);function _(){let[e,t]=(0,a.useState)(!0),[r,i]=(0,a.useState)(!0),[d,u]=(0,a.useState)(!0),f=a.useRef(null),x=(0,c.X)(),[g,b]=(0,a.useState)(!1),[_,N]=(0,a.useState)(!1),w=(0,p.useRouter)(),[v,C]=(0,a.useState)([]),[S,E]=(0,a.useState)([]),[P,O]=(0,a.useState)([]),[q,k]=(0,a.useState)([]),[U,J]=(0,a.useState)(0),[D,G]=(0,a.useState)(0),[T,I]=(0,a.useState)(null),R=a.useCallback(async()=>{t(!0),i(!0);try{let{allGPUs:e,perContextGPUs:t,perNodeGPUs:r}=await h();C(e||[]),E(t||[]),O(r||[]),b(!0)}catch(e){console.error("Error fetching Kubernetes data:",e),C([]),E([]),O([])}finally{t(!1)}try{let e=await m();k((null==e?void 0:e.clouds)||[]),J((null==e?void 0:e.totalClouds)||0),G((null==e?void 0:e.enabledClouds)||0),N(!0)}catch(e){console.error("Error fetching cloud infrastructure data:",e),k([]),J(0),G(0)}finally{i(!1),d&&u(!1)}},[d]);a.useEffect(()=>{f&&(f.current=R)},[f,R]),(0,a.useEffect)(()=>{let e=!0;R();let t=setInterval(()=>{e&&R()},6e4);return()=>{e=!1,clearInterval(t)}},[R]),v.length;let K=v.reduce((e,t)=>e+t.gpu_total,0),L=v.reduce((e,t)=>e+t.gpu_free,0),X=a.useMemo(()=>S?S.reduce((e,t)=>{let{context:r}=t;return e[r]||(e[r]=[]),e[r].push(t),e},{}):{},[S]),z=a.useMemo(()=>Object.keys(X),[X]),A=a.useMemo(()=>P?P.reduce((e,t)=>{let{context:r}=t;return e[r]||(e[r]=[]),e[r].push(t),e},{}):{},[P]);(0,a.useEffect)(()=>{w.query.context&&I(decodeURIComponent(Array.isArray(w.query.context)?w.query.context[0]:w.query.context))},[w.isReady,w.query]);let M=e=>{I(e),w.replace({pathname:"/infra",query:e?{context:e}:void 0},e?"/infra/".concat(encodeURIComponent(e)):"/infra",{shallow:!0})},F=()=>{I(null),w.replace({pathname:"/infra"},"/infra",{shallow:!0})};(0,a.useEffect)(()=>{let e=e=>{let t=e.match(/\/infra\/([^\/]+)$/);t?I(decodeURIComponent(t[1])):"/infra"===e&&I(null)};return w.events.on("routeChangeComplete",e),()=>{w.events.off("routeChangeComplete",e)}},[w.events]);let Z=e||r;return(0,s.jsxs)(l.A,{highlighted:"infra",children:[(0,s.jsxs)("div",{className:"flex items-center justify-between mb-4 h-5",children:[(0,s.jsxs)("div",{className:"text-base flex items-center",children:[(0,s.jsx)(j(),{href:"/infra",className:"text-sky-blue hover:underline ".concat(T?"":"cursor-default"),onClick:e=>{T&&(e.preventDefault(),F())},children:"Infrastructure"}),T&&(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)("span",{className:"mx-2 text-gray-500",children:"›"}),(0,s.jsx)("span",{className:"text-sky-blue hover:underline cursor-pointer",onClick:e=>{e.preventDefault(),F()},children:"Kubernetes"}),(0,s.jsx)("span",{className:"mx-2 text-gray-500",children:"›"}),(0,s.jsx)("span",{className:"text-sky-blue",children:T})]})]}),(0,s.jsxs)("div",{className:"flex items-center",children:[Z&&(0,s.jsxs)("div",{className:"flex items-center mr-2",children:[(0,s.jsx)(n.Z,{size:15,className:"mt-0"}),(0,s.jsx)("span",{className:"ml-2 text-gray-500",children:"Loading..."})]}),(0,s.jsxs)("button",{onClick:()=>{f.current&&(u(!1),f.current())},disabled:Z,className:"text-sky-blue hover:text-sky-blue-bright flex items-center",children:[(0,s.jsx)(o.Z,{className:"h-4 w-4 mr-1.5"}),!x&&"Refresh"]})]})]}),T?e&&!g?(0,s.jsxs)("div",{className:"flex flex-col items-center justify-center h-64",children:[(0,s.jsx)(n.Z,{size:32,className:"mb-4"}),(0,s.jsx)("span",{className:"text-gray-500 text-lg",children:"Loading Context..."})]}):(e=>{let t=X[e]||[],r=A[e]||[];return(0,s.jsx)("div",{className:"mb-4",children:(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm h-full",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsx)("h4",{className:"text-lg font-semibold mb-4",children:"Available GPUs"}),(0,s.jsx)("div",{className:"grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4 mb-6",children:t.map(e=>{let t=e.gpu_total-e.gpu_free,r=e.gpu_total>0?e.gpu_free/e.gpu_total*100:0,a=e.gpu_total>0?t/e.gpu_total*100:0;return(0,s.jsxs)("div",{className:"p-3 bg-gray-50 rounded-md border border-gray-200 shadow-sm",children:[(0,s.jsxs)("div",{className:"flex justify-between items-center mb-1.5 flex-wrap",children:[(0,s.jsxs)("div",{className:"font-medium text-gray-800 text-sm",children:[e.gpu_name,(0,s.jsxs)("span",{className:"text-xs text-gray-500 ml-2",children:["(Requestable: ",e.gpu_requestable_qty_per_node," / node)"]})]}),(0,s.jsxs)("span",{className:"text-xs font-medium",children:[e.gpu_free," free / ",e.gpu_total," total"]})]}),(0,s.jsxs)("div",{className:"w-full bg-gray-100 rounded-md h-4 flex overflow-hidden shadow-sm",children:[a>0&&(0,s.jsx)("div",{style:{width:"".concat(a,"%")},className:"bg-yellow-500 h-full flex items-center justify-center text-white text-xs",children:a>15&&"".concat(t," used")}),r>0&&(0,s.jsx)("div",{style:{width:"".concat(r,"%")},className:"bg-green-700 h-full flex items-center justify-center text-white text-xs",children:r>15&&"".concat(e.gpu_free," free")})]})]},e.gpu_name)})}),r.length>0&&(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)("h4",{className:"text-lg font-semibold mb-4",children:"Nodes"}),(0,s.jsx)("div",{className:"overflow-x-auto rounded-md border border-gray-200 shadow-sm",children:(0,s.jsxs)("table",{className:"min-w-full text-sm",children:[(0,s.jsx)("thead",{className:"bg-gray-100",children:(0,s.jsxs)("tr",{children:[(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600",children:"Node"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600",children:"GPU"}),(0,s.jsx)("th",{className:"p-3 text-right font-medium text-gray-600",children:"Availability"})]})}),(0,s.jsx)("tbody",{className:"bg-white divide-y divide-gray-200",children:r.map((e,t)=>(0,s.jsxs)("tr",{className:"hover:bg-gray-50",children:[(0,s.jsx)("td",{className:"p-3 whitespace-nowrap text-gray-700",children:e.node_name}),(0,s.jsx)("td",{className:"p-3 whitespace-nowrap text-gray-700",children:e.gpu_name}),(0,s.jsx)("td",{className:"p-3 whitespace-nowrap text-right text-gray-700",children:"".concat(e.gpu_free," of ").concat(e.gpu_total," free")})]},"".concat(e.node_name,"-").concat(t)))})]})})]})]})})})})(T):(0,s.jsxs)(s.Fragment,{children:[e&&!g?(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold mb-4",children:"Kubernetes"}),(0,s.jsxs)("div",{className:"flex items-center justify-center py-6",children:[(0,s.jsx)(n.Z,{size:24,className:"mr-3"}),(0,s.jsx)("span",{className:"text-gray-500",children:"Loading Kubernetes..."})]})]})}):g&&v.length>0?(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsxs)("div",{className:"flex items-center mb-4",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold",children:"Kubernetes"}),(0,s.jsxs)("span",{className:"ml-2 px-2 py-0.5 bg-blue-100 text-blue-800 rounded-full text-xs font-medium",children:[z.length," ",1===z.length?"context":"contexts"]})]}),(0,s.jsxs)("div",{className:"grid grid-cols-1 md:grid-cols-2 gap-6",children:[(0,s.jsx)("div",{children:(0,s.jsx)("div",{className:"overflow-x-auto rounded-md border border-gray-200 shadow-sm bg-white",children:(0,s.jsxs)("table",{className:"min-w-full text-sm",children:[(0,s.jsx)("thead",{className:"bg-gray-50",children:(0,s.jsxs)("tr",{children:[(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/3",children:"Context"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/6",children:"Nodes"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/3",children:"GPU Types"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/6",children:"#GPUs"})]})}),(0,s.jsx)("tbody",{className:"bg-white divide-y divide-gray-200 ".concat(z.length>5?"max-h-[250px] overflow-y-auto block":""),children:z.map(e=>{let t=X[e]||[],r=A[e]||[],a=t.reduce((e,t)=>e+(t.gpu_total||0),0),n=Object.keys(t.reduce((e,t)=>(e[t.gpu_name]=(e[t.gpu_name]||0)+(t.gpu_total||0),e),{})).join(", ");return(0,s.jsxs)("tr",{className:"hover:bg-gray-50",children:[(0,s.jsx)("td",{className:"p-3",children:(0,s.jsx)(y.Md,{content:e,className:"text-sm text-muted-foreground",children:(0,s.jsx)("span",{className:"text-blue-600 hover:underline cursor-pointer",onClick:()=>M(e),children:e.length>30?"".concat(e.substring(0,Math.floor(13.5)),"...").concat(e.substring(e.length-Math.ceil(13.5))):e})})}),(0,s.jsx)("td",{className:"p-3",children:r.length}),(0,s.jsx)("td",{className:"p-3",children:n}),(0,s.jsx)("td",{className:"p-3",children:a})]},e)})})]})})}),(0,s.jsx)("div",{children:(0,s.jsx)("div",{className:"overflow-x-auto rounded-md border border-gray-200 shadow-sm bg-white",children:(0,s.jsxs)("table",{className:"min-w-full text-sm",children:[(0,s.jsx)("thead",{className:"bg-gray-50",children:(0,s.jsxs)("tr",{children:[(0,s.jsxs)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/4 whitespace-nowrap",children:["GPU",(0,s.jsxs)("span",{className:"ml-2 px-2 py-0.5 bg-green-100 text-green-800 rounded-full text-xs font-medium whitespace-nowrap",children:[L," of ",K," free"]})]}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/4",children:"Requestable"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-1/2",children:(0,s.jsx)("div",{className:"flex items-center",children:(0,s.jsx)("span",{children:"Utilization"})})})]})}),(0,s.jsx)("tbody",{className:"bg-white divide-y divide-gray-200 ".concat(v.length>5?"max-h-[250px] overflow-y-auto block":""),children:v.map(e=>{let t=e.gpu_total-e.gpu_free,r=e.gpu_total>0?e.gpu_free/e.gpu_total*100:0,a=e.gpu_total>0?t/e.gpu_total*100:0,n=S.filter(t=>t.gpu_name===e.gpu_name).map(e=>e.gpu_requestable_qty_per_node).filter((e,t,r)=>r.indexOf(e)===t).join(", ");return(0,s.jsxs)("tr",{children:[(0,s.jsx)("td",{className:"p-3 font-medium w-24 whitespace-nowrap",children:e.gpu_name}),(0,s.jsxs)("td",{className:"p-3 text-xs text-gray-600",children:[n||"-"," / node"]}),(0,s.jsx)("td",{className:"p-3 w-2/3",children:(0,s.jsx)("div",{className:"flex items-center gap-3",children:(0,s.jsxs)("div",{className:"flex-1 bg-gray-100 rounded-md h-5 flex overflow-hidden shadow-sm min-w-[100px] w-full",children:[a>0&&(0,s.jsx)("div",{style:{width:"".concat(a,"%")},className:"bg-yellow-500 h-full flex items-center justify-center text-white text-xs font-medium",children:a>15&&"".concat(t," used")}),r>0&&(0,s.jsx)("div",{style:{width:"".concat(r,"%")},className:"bg-green-700 h-full flex items-center justify-center text-white text-xs font-medium",children:r>15&&"".concat(e.gpu_free," free")})]})})})]},e.gpu_name)})})]})})})]})]})}):g&&0===v.length?(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold mb-4",children:"Kubernetes Infrastructure"}),(0,s.jsx)("p",{className:"text-sm text-gray-500",children:"No Kubernetes GPUs found or Kubernetes is not configured."})]})}):null,r&&!_?(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold mb-4",children:"Cloud"}),(0,s.jsxs)("div",{className:"flex items-center justify-center py-6",children:[(0,s.jsx)(n.Z,{size:24,className:"mr-3"}),(0,s.jsx)("span",{className:"text-gray-500",children:"Loading Cloud..."})]})]})}):(0,s.jsx)("div",{className:"rounded-lg border bg-card text-card-foreground shadow-sm mb-6",children:(0,s.jsxs)("div",{className:"p-5",children:[(0,s.jsxs)("div",{className:"flex items-center mb-4",children:[(0,s.jsx)("h3",{className:"text-lg font-semibold",children:"Cloud"}),(0,s.jsxs)("span",{className:"ml-2 px-2 py-0.5 bg-blue-100 text-blue-800 rounded-full text-xs font-medium",children:[D," of ",U," enabled"]})]}),0===q.length?(0,s.jsx)("p",{className:"text-sm text-gray-500",children:"No enabled clouds available."}):(0,s.jsx)("div",{className:"overflow-x-auto rounded-md border border-gray-200 shadow-sm bg-white",children:(0,s.jsxs)("table",{className:"min-w-full text-sm",children:[(0,s.jsx)("thead",{className:"bg-gray-50",children:(0,s.jsxs)("tr",{children:[(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-32",children:"Cloud"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-24",children:"Clusters"}),(0,s.jsx)("th",{className:"p-3 text-left font-medium text-gray-600 w-24",children:"Jobs"})]})}),(0,s.jsx)("tbody",{className:"bg-white divide-y divide-gray-200",children:q.map(e=>(0,s.jsxs)("tr",{className:"hover:bg-gray-50",children:[(0,s.jsx)("td",{className:"p-3 font-medium text-gray-700",children:e.name}),(0,s.jsx)("td",{className:"p-3",children:e.clusters>0?(0,s.jsx)("span",{className:"px-2 py-0.5 bg-blue-100 text-blue-800 rounded text-xs font-medium",children:e.clusters}):(0,s.jsx)("span",{className:"px-2 py-0.5 bg-gray-100 text-gray-500 rounded text-xs font-medium",children:"0"})}),(0,s.jsx)("td",{className:"p-3",children:e.jobs>0?(0,s.jsx)("span",{className:"px-2 py-0.5 bg-green-100 text-green-800 rounded text-xs font-medium",children:e.jobs}):(0,s.jsx)("span",{className:"px-2 py-0.5 bg-gray-100 text-gray-500 rounded text-xs font-medium",children:"0"})})]},e.name))})]})})]})})]})]})}var N=r(9008),w=r.n(N);function v(){return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(w(),{children:(0,s.jsx)("title",{children:"Infra | SkyPilot Dashboard"})}),(0,s.jsx)(_,{})]})}}},function(e){e.O(0,[582,480,888,774,179],function(){return e(e.s=6206)}),_N_E=e.O()}]);
|
@@ -1,308 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script.
|
3
|
-
set -e
|
4
|
-
|
5
|
-
# Colors for nicer UX
|
6
|
-
RED='\033[0;31m'
|
7
|
-
GREEN='\033[0;32m'
|
8
|
-
YELLOW='\033[1;33m'
|
9
|
-
NC='\033[0m' # No color
|
10
|
-
|
11
|
-
# Variables
|
12
|
-
CLEANUP=false
|
13
|
-
INSTALL_GPU=false
|
14
|
-
POSITIONAL_ARGS=()
|
15
|
-
PASSWORD=""
|
16
|
-
|
17
|
-
# Process all arguments
|
18
|
-
while [[ $# -gt 0 ]]; do
|
19
|
-
case $1 in
|
20
|
-
--cleanup)
|
21
|
-
CLEANUP=true
|
22
|
-
shift
|
23
|
-
;;
|
24
|
-
--password)
|
25
|
-
PASSWORD=$2
|
26
|
-
shift
|
27
|
-
shift
|
28
|
-
;;
|
29
|
-
*)
|
30
|
-
POSITIONAL_ARGS+=("$1")
|
31
|
-
shift
|
32
|
-
;;
|
33
|
-
esac
|
34
|
-
done
|
35
|
-
|
36
|
-
# Restore positional arguments in correct order
|
37
|
-
set -- "${POSITIONAL_ARGS[@]}"
|
38
|
-
|
39
|
-
# Assign positional arguments to variables
|
40
|
-
IPS_FILE=$1
|
41
|
-
USER=$2
|
42
|
-
SSH_KEY=$3
|
43
|
-
CONTEXT_NAME=${4:-default}
|
44
|
-
K3S_TOKEN=mytoken # Any string can be used as the token
|
45
|
-
# Create temporary askpass script for sudo
|
46
|
-
ASKPASS_BLOCK="# Create temporary askpass script
|
47
|
-
ASKPASS_SCRIPT=\$(mktemp)
|
48
|
-
trap 'rm -f \$ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
|
49
|
-
cat > \$ASKPASS_SCRIPT << EOF
|
50
|
-
#!/bin/bash
|
51
|
-
echo $PASSWORD
|
52
|
-
EOF
|
53
|
-
chmod 700 \$ASKPASS_SCRIPT
|
54
|
-
# Use askpass
|
55
|
-
export SUDO_ASKPASS=\$ASKPASS_SCRIPT
|
56
|
-
"
|
57
|
-
|
58
|
-
# Basic argument checks
|
59
|
-
if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
|
60
|
-
>&2 echo -e "${RED}Error: Missing required arguments.${NC}"
|
61
|
-
>&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [context-name] [--cleanup] [--password password]"
|
62
|
-
exit 1
|
63
|
-
fi
|
64
|
-
|
65
|
-
# Check if SSH key exists
|
66
|
-
if [ ! -f "$SSH_KEY" ]; then
|
67
|
-
>&2 echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}"
|
68
|
-
exit 1
|
69
|
-
fi
|
70
|
-
|
71
|
-
# Check if IPs file exists
|
72
|
-
if [ ! -f "$IPS_FILE" ]; then
|
73
|
-
>&2 echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}"
|
74
|
-
exit 1
|
75
|
-
fi
|
76
|
-
|
77
|
-
# Get head node and worker nodes from the IPs file
|
78
|
-
HEAD_NODE=$(head -n 1 "$IPS_FILE")
|
79
|
-
WORKER_NODES=$(tail -n +2 "$IPS_FILE")
|
80
|
-
|
81
|
-
# Check if the IPs file is empty or not formatted correctly
|
82
|
-
if [ -z "$HEAD_NODE" ]; then
|
83
|
-
>&2 echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}"
|
84
|
-
exit 1
|
85
|
-
fi
|
86
|
-
|
87
|
-
# Function to show a progress message
|
88
|
-
progress_message() {
|
89
|
-
echo -e "${YELLOW}➜ $1${NC}"
|
90
|
-
}
|
91
|
-
|
92
|
-
# Step to display success
|
93
|
-
success_message() {
|
94
|
-
echo -e "${GREEN}✔ $1${NC}"
|
95
|
-
}
|
96
|
-
|
97
|
-
# Function to run a command on a remote machine via SSH
|
98
|
-
run_remote() {
|
99
|
-
local NODE_IP=$1
|
100
|
-
local CMD=$2
|
101
|
-
# echo -e "${YELLOW}Running command on $NODE_IP...${NC}"
|
102
|
-
ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$NODE_IP" "$CMD"
|
103
|
-
}
|
104
|
-
|
105
|
-
# Function to uninstall k3s and clean up the state on a remote machine
|
106
|
-
cleanup_server_node() {
|
107
|
-
local NODE_IP=$1
|
108
|
-
echo -e "${YELLOW}Cleaning up head node $NODE_IP...${NC}"
|
109
|
-
run_remote "$NODE_IP" "
|
110
|
-
$ASKPASS_BLOCK
|
111
|
-
echo 'Uninstalling k3s...' &&
|
112
|
-
sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
|
113
|
-
sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
|
114
|
-
"
|
115
|
-
echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
|
116
|
-
}
|
117
|
-
|
118
|
-
# Function to uninstall k3s and clean up the state on a remote machine
|
119
|
-
cleanup_agent_node() {
|
120
|
-
local NODE_IP=$1
|
121
|
-
echo -e "${YELLOW}Cleaning up node $NODE_IP...${NC}"
|
122
|
-
run_remote "$NODE_IP" "
|
123
|
-
$ASKPASS_BLOCK
|
124
|
-
echo 'Uninstalling k3s...' &&
|
125
|
-
sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
|
126
|
-
sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
|
127
|
-
"
|
128
|
-
echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
|
129
|
-
}
|
130
|
-
|
131
|
-
check_gpu() {
|
132
|
-
local NODE_IP=$1
|
133
|
-
if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
|
134
|
-
return 0 # GPU detected
|
135
|
-
else
|
136
|
-
return 1 # No GPU detected
|
137
|
-
fi
|
138
|
-
}
|
139
|
-
|
140
|
-
# Pre-flight checks
|
141
|
-
run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
|
142
|
-
# TODO: Add more pre-flight checks here, including checking if port 6443 is accessible
|
143
|
-
|
144
|
-
# If --cleanup flag is set, uninstall k3s and exit
|
145
|
-
if [ "$CLEANUP" == "true" ]; then
|
146
|
-
echo -e "${YELLOW}Starting cleanup...${NC}"
|
147
|
-
|
148
|
-
# Clean up head node
|
149
|
-
cleanup_server_node "$HEAD_NODE"
|
150
|
-
|
151
|
-
# Clean up worker nodes
|
152
|
-
for NODE in $WORKER_NODES; do
|
153
|
-
cleanup_agent_node "$NODE"
|
154
|
-
done
|
155
|
-
|
156
|
-
# Remove the context from local kubeconfig if it exists
|
157
|
-
if [ -f "$HOME/.kube/config" ]; then
|
158
|
-
progress_message "Removing context '$CONTEXT_NAME' from local kubeconfig..."
|
159
|
-
kubectl config delete-context "$CONTEXT_NAME" 2>/dev/null || true
|
160
|
-
kubectl config delete-cluster "$CONTEXT_NAME" 2>/dev/null || true
|
161
|
-
kubectl config delete-user "$CONTEXT_NAME" 2>/dev/null || true
|
162
|
-
# Update the current context to the first available context
|
163
|
-
kubectl config use-context $(kubectl config view -o jsonpath='{.contexts[0].name}') 2>/dev/null || true
|
164
|
-
success_message "Context '$CONTEXT_NAME' removed from local kubeconfig."
|
165
|
-
fi
|
166
|
-
|
167
|
-
echo -e "${GREEN}Cleanup completed successfully.${NC}"
|
168
|
-
exit 0
|
169
|
-
fi
|
170
|
-
|
171
|
-
# Step 1: Install k3s on the head node
|
172
|
-
progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
|
173
|
-
run_remote "$HEAD_NODE" "
|
174
|
-
$ASKPASS_BLOCK
|
175
|
-
curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sudo -E -A sh - &&
|
176
|
-
mkdir -p ~/.kube &&
|
177
|
-
sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
|
178
|
-
sudo -A chown \$(id -u):\$(id -g) ~/.kube/config &&
|
179
|
-
for i in {1..3}; do
|
180
|
-
if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
|
181
|
-
break
|
182
|
-
else
|
183
|
-
echo 'Waiting for nodes to be ready...'
|
184
|
-
sleep 5
|
185
|
-
fi
|
186
|
-
done
|
187
|
-
if [ \$i -eq 3 ]; then
|
188
|
-
echo 'Failed to wait for nodes to be ready after 3 attempts'
|
189
|
-
exit 1
|
190
|
-
fi"
|
191
|
-
success_message "K3s deployed on head node."
|
192
|
-
|
193
|
-
# Check if head node has a GPU
|
194
|
-
if check_gpu "$HEAD_NODE"; then
|
195
|
-
echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
|
196
|
-
INSTALL_GPU=true
|
197
|
-
fi
|
198
|
-
|
199
|
-
# Fetch the head node's internal IP (this will be passed to worker nodes)
|
200
|
-
MASTER_ADDR=$(run_remote "$HEAD_NODE" "hostname -I | awk '{print \$1}'")
|
201
|
-
|
202
|
-
echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"
|
203
|
-
|
204
|
-
# Step 2: Install k3s on worker nodes and join them to the master node
|
205
|
-
for NODE in $WORKER_NODES; do
|
206
|
-
progress_message "Deploying Kubernetes on worker node ($NODE)..."
|
207
|
-
run_remote "$NODE" "
|
208
|
-
$ASKPASS_BLOCK
|
209
|
-
curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sudo -E -A sh -"
|
210
|
-
success_message "Kubernetes deployed on worker node ($NODE)."
|
211
|
-
|
212
|
-
# Check if worker node has a GPU
|
213
|
-
if check_gpu "$NODE"; then
|
214
|
-
echo -e "${YELLOW}GPU detected on worker node ($NODE).${NC}"
|
215
|
-
INSTALL_GPU=true
|
216
|
-
fi
|
217
|
-
done
|
218
|
-
# Step 3: Configure local kubectl to connect to the cluster
|
219
|
-
progress_message "Configuring local kubectl to connect to the cluster..."
|
220
|
-
|
221
|
-
# Create temporary directory for kubeconfig operations
|
222
|
-
TEMP_DIR=$(mktemp -d)
|
223
|
-
TEMP_KUBECONFIG="$TEMP_DIR/kubeconfig"
|
224
|
-
|
225
|
-
# Get the kubeconfig from remote server
|
226
|
-
scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config "$TEMP_KUBECONFIG"
|
227
|
-
|
228
|
-
# Create .kube directory if it doesn't exist
|
229
|
-
mkdir -p "$HOME/.kube"
|
230
|
-
|
231
|
-
# Create empty kubeconfig if it doesn't exist
|
232
|
-
KUBECONFIG_FILE="$HOME/.kube/config"
|
233
|
-
if [[ ! -f "$KUBECONFIG_FILE" ]]; then
|
234
|
-
touch "$KUBECONFIG_FILE"
|
235
|
-
fi
|
236
|
-
|
237
|
-
# Modify the temporary kubeconfig to update server address and context name
|
238
|
-
awk -v context="$CONTEXT_NAME" '
|
239
|
-
/^clusters:/ { in_cluster = 1 }
|
240
|
-
/^users:/ { in_cluster = 0 }
|
241
|
-
in_cluster && /^ *certificate-authority-data:/ { next }
|
242
|
-
in_cluster && /^ *server:/ {
|
243
|
-
print " server: https://'${HEAD_NODE}:6443'"
|
244
|
-
print " insecure-skip-tls-verify: true"
|
245
|
-
next
|
246
|
-
}
|
247
|
-
/name: default/ { sub("name: default", "name: " context) }
|
248
|
-
/cluster: default/ { sub("cluster: default", "cluster: " context) }
|
249
|
-
/user: default/ { sub("user: default", "user: " context) }
|
250
|
-
/current-context: default/ { sub("current-context: default", "current-context: " context) }
|
251
|
-
{ print }
|
252
|
-
' "$TEMP_KUBECONFIG" > "$TEMP_DIR/modified_config"
|
253
|
-
|
254
|
-
# Merge the configurations using kubectl
|
255
|
-
KUBECONFIG="$KUBECONFIG_FILE:$TEMP_DIR/modified_config" kubectl config view --flatten > "$TEMP_DIR/merged_config"
|
256
|
-
mv "$TEMP_DIR/merged_config" "$KUBECONFIG_FILE"
|
257
|
-
|
258
|
-
# Set the new context as the current context
|
259
|
-
kubectl config use-context "$CONTEXT_NAME"
|
260
|
-
|
261
|
-
# Clean up temporary files
|
262
|
-
rm -rf "$TEMP_DIR"
|
263
|
-
|
264
|
-
success_message "kubectl configured with new context '$CONTEXT_NAME'."
|
265
|
-
|
266
|
-
echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
|
267
|
-
|
268
|
-
# Install GPU operator if a GPU was detected on any node
|
269
|
-
if [ "$INSTALL_GPU" == "true" ]; then
|
270
|
-
echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
|
271
|
-
run_remote "$HEAD_NODE" "
|
272
|
-
$ASKPASS_BLOCK
|
273
|
-
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
|
274
|
-
chmod 700 get_helm.sh &&
|
275
|
-
./get_helm.sh &&
|
276
|
-
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
|
277
|
-
kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
|
278
|
-
sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
|
279
|
-
helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
|
280
|
-
--set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
|
281
|
-
--set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
|
282
|
-
--set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
|
283
|
-
--set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
|
284
|
-
--set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
|
285
|
-
--set 'toolkit.env[2].value=nvidia' &&
|
286
|
-
echo 'Waiting for GPU operator installation...' &&
|
287
|
-
while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
|
288
|
-
echo 'Waiting for GPU operator...'
|
289
|
-
sleep 5
|
290
|
-
done
|
291
|
-
echo 'GPU operator installed successfully.'"
|
292
|
-
success_message "GPU Operator installed."
|
293
|
-
else
|
294
|
-
echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
|
295
|
-
fi
|
296
|
-
|
297
|
-
# Configure SkyPilot
|
298
|
-
progress_message "Configuring SkyPilot..."
|
299
|
-
sky check kubernetes
|
300
|
-
success_message "SkyPilot configured successfully."
|
301
|
-
|
302
|
-
# Display final success message
|
303
|
-
echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
|
304
|
-
echo "You can now interact with your Kubernetes cluster through SkyPilot: "
|
305
|
-
echo " • List available GPUs: sky show-gpus --cloud kubernetes"
|
306
|
-
echo " • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1"
|
307
|
-
echo " • Connect to pod with SSH: ssh devbox"
|
308
|
-
echo " • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"
|
/sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js
RENAMED
File without changes
|
{skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|