skypilot-nightly 1.0.0.dev20241110__tar.gz → 1.0.0.dev20241112__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (303)
  1. {skypilot_nightly-1.0.0.dev20241110/skypilot_nightly.egg-info → skypilot_nightly-1.0.0.dev20241112}/PKG-INFO +2 -2
  2. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/README.md +1 -1
  3. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/__init__.py +2 -2
  4. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/cli.py +7 -3
  5. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/kubernetes_catalog.py +34 -11
  6. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/docker_utils.py +1 -1
  7. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/kubernetes/instance.py +104 -102
  8. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/kubernetes/utils.py +26 -14
  9. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/__init__.py +2 -0
  10. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/load_balancer.py +34 -8
  11. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/load_balancing_policies.py +23 -1
  12. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/service.py +4 -1
  13. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/service_spec.py +19 -0
  14. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/kubernetes-ray.yml.j2 +21 -1
  15. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/generate_kubeconfig.sh +3 -0
  16. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/schemas.py +8 -0
  17. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112/skypilot_nightly.egg-info}/PKG-INFO +2 -2
  18. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_smoke.py +5 -5
  19. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/LICENSE +0 -0
  20. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/MANIFEST.in +0 -0
  21. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/pyproject.toml +0 -0
  22. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/setup.cfg +0 -0
  23. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/setup.py +0 -0
  24. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/__init__.py +0 -0
  25. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/aws.py +0 -0
  26. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/azure.py +0 -0
  27. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/cloudflare.py +0 -0
  28. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/common.py +0 -0
  29. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/cudo.py +0 -0
  30. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/docker.py +0 -0
  31. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/gcp.py +0 -0
  32. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/ibm.py +0 -0
  33. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/kubernetes.py +0 -0
  34. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/oci.py +0 -0
  35. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/runpod.py +0 -0
  36. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/adaptors/vsphere.py +0 -0
  37. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/admin_policy.py +0 -0
  38. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/authentication.py +0 -0
  39. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/backends/__init__.py +0 -0
  40. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/backends/backend.py +0 -0
  41. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/backends/backend_utils.py +0 -0
  42. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/backends/cloud_vm_ray_backend.py +0 -0
  43. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/backends/docker_utils.py +0 -0
  44. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/backends/local_docker_backend.py +0 -0
  45. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/backends/monkey_patches/monkey_patch_ray_up.py +0 -0
  46. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/backends/wheel_utils.py +0 -0
  47. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/benchmark/__init__.py +0 -0
  48. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/benchmark/benchmark_state.py +0 -0
  49. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/benchmark/benchmark_utils.py +0 -0
  50. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/check.py +0 -0
  51. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/cloud_stores.py +0 -0
  52. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/__init__.py +0 -0
  53. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/aws.py +0 -0
  54. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/azure.py +0 -0
  55. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/cloud.py +0 -0
  56. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/cloud_registry.py +0 -0
  57. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/cudo.py +0 -0
  58. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/fluidstack.py +0 -0
  59. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/gcp.py +0 -0
  60. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/ibm.py +0 -0
  61. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/kubernetes.py +0 -0
  62. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/lambda_cloud.py +0 -0
  63. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/oci.py +0 -0
  64. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/paperspace.py +0 -0
  65. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/runpod.py +0 -0
  66. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/scp.py +0 -0
  67. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/__init__.py +0 -0
  68. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/aws_catalog.py +0 -0
  69. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/azure_catalog.py +0 -0
  70. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/common.py +0 -0
  71. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/config.py +0 -0
  72. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/constants.py +0 -0
  73. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/cudo_catalog.py +0 -0
  74. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/data_fetchers/__init__.py +0 -0
  75. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +0 -0
  76. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +0 -0
  77. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +0 -0
  78. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +0 -0
  79. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +0 -0
  80. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +0 -0
  81. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +0 -0
  82. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/fluidstack_catalog.py +0 -0
  83. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/gcp_catalog.py +0 -0
  84. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/ibm_catalog.py +0 -0
  85. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/lambda_catalog.py +0 -0
  86. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/oci_catalog.py +0 -0
  87. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/paperspace_catalog.py +0 -0
  88. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/runpod_catalog.py +0 -0
  89. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/scp_catalog.py +0 -0
  90. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/service_catalog/vsphere_catalog.py +0 -0
  91. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/utils/__init__.py +0 -0
  92. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/utils/aws_utils.py +0 -0
  93. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/utils/azure_utils.py +0 -0
  94. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/utils/gcp_utils.py +0 -0
  95. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/utils/oci_utils.py +0 -0
  96. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/utils/scp_utils.py +0 -0
  97. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/clouds/vsphere.py +0 -0
  98. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/core.py +0 -0
  99. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/dag.py +0 -0
  100. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/data/__init__.py +0 -0
  101. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/data/data_transfer.py +0 -0
  102. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/data/data_utils.py +0 -0
  103. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/data/mounting_utils.py +0 -0
  104. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/data/storage.py +0 -0
  105. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/data/storage_utils.py +0 -0
  106. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/exceptions.py +0 -0
  107. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/execution.py +0 -0
  108. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/global_user_state.py +0 -0
  109. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/__init__.py +0 -0
  110. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/constants.py +0 -0
  111. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/controller.py +0 -0
  112. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/core.py +0 -0
  113. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/dashboard/dashboard.py +0 -0
  114. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/dashboard/static/favicon.ico +0 -0
  115. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/dashboard/templates/index.html +0 -0
  116. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/recovery_strategy.py +0 -0
  117. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/state.py +0 -0
  118. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/jobs/utils.py +0 -0
  119. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/optimizer.py +0 -0
  120. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/__init__.py +0 -0
  121. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/aws/__init__.py +0 -0
  122. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/aws/config.py +0 -0
  123. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/aws/instance.py +0 -0
  124. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/aws/utils.py +0 -0
  125. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/azure/__init__.py +0 -0
  126. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/azure/azure-config-template.json +0 -0
  127. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/azure/config.py +0 -0
  128. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/azure/instance.py +0 -0
  129. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/common.py +0 -0
  130. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/constants.py +0 -0
  131. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/cudo/__init__.py +0 -0
  132. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/cudo/config.py +0 -0
  133. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/cudo/cudo_machine_type.py +0 -0
  134. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/cudo/cudo_utils.py +0 -0
  135. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/cudo/cudo_wrapper.py +0 -0
  136. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/cudo/instance.py +0 -0
  137. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/fluidstack/__init__.py +0 -0
  138. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/fluidstack/config.py +0 -0
  139. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/fluidstack/fluidstack_utils.py +0 -0
  140. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/fluidstack/instance.py +0 -0
  141. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/gcp/__init__.py +0 -0
  142. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/gcp/config.py +0 -0
  143. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/gcp/constants.py +0 -0
  144. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/gcp/instance.py +0 -0
  145. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/gcp/instance_utils.py +0 -0
  146. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/gcp/mig_utils.py +0 -0
  147. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/instance_setup.py +0 -0
  148. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/kubernetes/__init__.py +0 -0
  149. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/kubernetes/config.py +0 -0
  150. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -0
  151. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -0
  152. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/kubernetes/network.py +0 -0
  153. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/kubernetes/network_utils.py +0 -0
  154. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/lambda_cloud/__init__.py +0 -0
  155. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/lambda_cloud/config.py +0 -0
  156. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/lambda_cloud/instance.py +0 -0
  157. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/lambda_cloud/lambda_utils.py +0 -0
  158. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/logging.py +0 -0
  159. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/metadata_utils.py +0 -0
  160. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/oci/__init__.py +0 -0
  161. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/oci/config.py +0 -0
  162. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/oci/instance.py +0 -0
  163. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/oci/query_utils.py +0 -0
  164. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/paperspace/__init__.py +0 -0
  165. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/paperspace/config.py +0 -0
  166. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/paperspace/constants.py +0 -0
  167. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/paperspace/instance.py +0 -0
  168. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/paperspace/utils.py +0 -0
  169. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/provisioner.py +0 -0
  170. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/runpod/__init__.py +0 -0
  171. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/runpod/config.py +0 -0
  172. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/runpod/instance.py +0 -0
  173. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/runpod/utils.py +0 -0
  174. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/__init__.py +0 -0
  175. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/__init__.py +0 -0
  176. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/cls_api_client.py +0 -0
  177. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/cls_api_helper.py +0 -0
  178. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/custom_script.py +0 -0
  179. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/id_generator.py +0 -0
  180. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/metadata_utils.py +0 -0
  181. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/service_manager.py +0 -0
  182. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/service_manager_factory.py +0 -0
  183. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/ssl_helper.py +0 -0
  184. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/vapiconnect.py +0 -0
  185. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/common/vim_utils.py +0 -0
  186. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/config.py +0 -0
  187. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/instance.py +0 -0
  188. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/provision/vsphere/vsphere_utils.py +0 -0
  189. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/resources.py +0 -0
  190. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/autoscalers.py +0 -0
  191. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/constants.py +0 -0
  192. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/controller.py +0 -0
  193. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/core.py +0 -0
  194. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/replica_managers.py +0 -0
  195. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/serve_state.py +0 -0
  196. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/serve/serve_utils.py +0 -0
  197. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/setup_files/MANIFEST.in +0 -0
  198. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/setup_files/setup.py +0 -0
  199. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/sky_logging.py +0 -0
  200. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/LICENSE +0 -0
  201. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/__init__.py +0 -0
  202. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/attempt_skylet.py +0 -0
  203. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/autostop_lib.py +0 -0
  204. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/configs.py +0 -0
  205. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/constants.py +0 -0
  206. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/events.py +0 -0
  207. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/job_lib.py +0 -0
  208. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/log_lib.py +0 -0
  209. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/log_lib.pyi +0 -0
  210. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/__init__.py +0 -0
  211. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/command_runner.py +0 -0
  212. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/ibm/__init__.py +0 -0
  213. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/ibm/node_provider.py +0 -0
  214. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/ibm/utils.py +0 -0
  215. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/ibm/vpc_provider.py +0 -0
  216. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/scp/__init__.py +0 -0
  217. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/scp/config.py +0 -0
  218. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/providers/scp/node_provider.py +0 -0
  219. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/ray_patches/__init__.py +0 -0
  220. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/ray_patches/autoscaler.py.patch +0 -0
  221. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/ray_patches/cli.py.patch +0 -0
  222. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/ray_patches/command_runner.py.patch +0 -0
  223. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/ray_patches/log_monitor.py.patch +0 -0
  224. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/ray_patches/resource_demand_scheduler.py.patch +0 -0
  225. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/ray_patches/updater.py.patch +0 -0
  226. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/ray_patches/worker.py.patch +0 -0
  227. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/skylet.py +0 -0
  228. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skylet/subprocess_daemon.py +0 -0
  229. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/skypilot_config.py +0 -0
  230. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/status_lib.py +0 -0
  231. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/task.py +0 -0
  232. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/aws-ray.yml.j2 +0 -0
  233. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/azure-ray.yml.j2 +0 -0
  234. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/cudo-ray.yml.j2 +0 -0
  235. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/fluidstack-ray.yml.j2 +0 -0
  236. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/gcp-ray.yml.j2 +0 -0
  237. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/ibm-ray.yml.j2 +0 -0
  238. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/jobs-controller.yaml.j2 +0 -0
  239. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/kubernetes-ingress.yml.j2 +0 -0
  240. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/kubernetes-loadbalancer.yml.j2 +0 -0
  241. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/kubernetes-port-forward-proxy-command.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/kubernetes-ssh-jump.yml.j2 +0 -0
  243. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/lambda-ray.yml.j2 +0 -0
  244. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/local-ray.yml.j2 +0 -0
  245. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/oci-ray.yml.j2 +0 -0
  246. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/paperspace-ray.yml.j2 +0 -0
  247. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/runpod-ray.yml.j2 +0 -0
  248. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/scp-ray.yml.j2 +0 -0
  249. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/sky-serve-controller.yaml.j2 +0 -0
  250. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/templates/vsphere-ray.yml.j2 +0 -0
  251. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/usage/__init__.py +0 -0
  252. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/usage/constants.py +0 -0
  253. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/usage/usage_lib.py +0 -0
  254. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/__init__.py +0 -0
  255. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/accelerator_registry.py +0 -0
  256. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/admin_policy_utils.py +0 -0
  257. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/cli_utils/__init__.py +0 -0
  258. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/cli_utils/status_utils.py +0 -0
  259. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/cluster_yaml_utils.py +0 -0
  260. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/command_runner.py +0 -0
  261. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/command_runner.pyi +0 -0
  262. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/common_utils.py +0 -0
  263. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/control_master_utils.py +0 -0
  264. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/controller_utils.py +0 -0
  265. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/dag_utils.py +0 -0
  266. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/db_utils.py +0 -0
  267. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/env_options.py +0 -0
  268. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/__init__.py +0 -0
  269. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/create_cluster.sh +0 -0
  270. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/delete_cluster.sh +0 -0
  271. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/deploy_remote_cluster.sh +0 -0
  272. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/generate_kind_config.py +0 -0
  273. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/gpu_labeler.py +0 -0
  274. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +0 -0
  275. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +0 -0
  276. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/rsync_helper.sh +0 -0
  277. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -0
  278. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/kubernetes_enums.py +0 -0
  279. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/log_utils.py +0 -0
  280. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/resources_utils.py +0 -0
  281. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/rich_utils.py +0 -0
  282. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/subprocess_utils.py +0 -0
  283. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/timeline.py +0 -0
  284. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/ux_utils.py +0 -0
  285. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/sky/utils/validator.py +0 -0
  286. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/skypilot_nightly.egg-info/SOURCES.txt +0 -0
  287. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/skypilot_nightly.egg-info/dependency_links.txt +0 -0
  288. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/skypilot_nightly.egg-info/entry_points.txt +0 -0
  289. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/skypilot_nightly.egg-info/requires.txt +0 -0
  290. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/skypilot_nightly.egg-info/top_level.txt +0 -0
  291. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_api.py +0 -0
  292. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_cli.py +0 -0
  293. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_config.py +0 -0
  294. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_global_user_state.py +0 -0
  295. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_jobs.py +0 -0
  296. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_jobs_and_serve.py +0 -0
  297. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_list_accelerators.py +0 -0
  298. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_optimizer_dryruns.py +0 -0
  299. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_optimizer_random_dag.py +0 -0
  300. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_serve_autoscaler.py +0 -0
  301. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_storage.py +0 -0
  302. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_wheels.py +0 -0
  303. {skypilot_nightly-1.0.0.dev20241110 → skypilot_nightly-1.0.0.dev20241112}/tests/test_yaml_parser.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241110
3
+ Version: 1.0.0.dev20241112
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -319,7 +319,7 @@ Runnable examples:
319
319
  - [LocalGPT](./llm/localgpt)
320
320
  - [Falcon](./llm/falcon)
321
321
  - Add yours here & see more in [`llm/`](./llm)!
322
- - Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
322
+ - Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
323
323
 
324
324
  Case Studies and Integrations: [Community Spotlights](https://blog.skypilot.co/community/)
325
325
 
@@ -183,7 +183,7 @@ Runnable examples:
183
183
  - [LocalGPT](./llm/localgpt)
184
184
  - [Falcon](./llm/falcon)
185
185
  - Add yours here & see more in [`llm/`](./llm)!
186
- - Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
186
+ - Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Cog](https://github.com/skypilot-org/skypilot/blob/master/examples/cog/), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), [Ollama](https://github.com/skypilot-org/skypilot/blob/master/llm/ollama), [llm.c](https://github.com/skypilot-org/skypilot/tree/master/llm/gpt-2), [Airflow](./examples/airflow/training_workflow) and [many more (`examples/`)](./examples).
187
187
 
188
188
  Case Studies and Integrations: [Community Spotlights](https://blog.skypilot.co/community/)
189
189
 
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = 'dddd65187953a5d6b32f762bea78eed1f109ec3c'
8
+ _SKYPILOT_COMMIT_SHA = '140125eaad5fb64da37934c8f6650d68aa135f77'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241110'
38
+ __version__ = '1.0.0.dev20241112'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -3102,6 +3102,7 @@ def show_gpus(
3102
3102
  kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
3103
3103
  kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
3104
3104
  sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
3105
+ no_permissions_str = '<no permissions>'
3105
3106
 
3106
3107
  def _list_to_str(lst):
3107
3108
  return ', '.join([str(e) for e in lst])
@@ -3146,9 +3147,11 @@ def show_gpus(
3146
3147
  debug_msg)
3147
3148
  raise ValueError(full_err_msg)
3148
3149
  for gpu, _ in sorted(counts.items()):
3150
+ available_qty = available[gpu] if available[gpu] != -1 else (
3151
+ no_permissions_str)
3149
3152
  realtime_gpu_table.add_row([
3150
3153
  gpu,
3151
- _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu]
3154
+ _list_to_str(counts.pop(gpu)), capacity[gpu], available_qty
3152
3155
  ])
3153
3156
  return realtime_gpu_table
3154
3157
 
@@ -3158,10 +3161,11 @@ def show_gpus(
3158
3161
 
3159
3162
  node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
3160
3163
  for node_name, node_info in node_info_dict.items():
3164
+ available = node_info.free['nvidia.com/gpu'] if node_info.free[
3165
+ 'nvidia.com/gpu'] != -1 else no_permissions_str
3161
3166
  node_table.add_row([
3162
3167
  node_name, node_info.gpu_type,
3163
- node_info.total['nvidia.com/gpu'],
3164
- node_info.free['nvidia.com/gpu']
3168
+ node_info.total['nvidia.com/gpu'], available
3165
3169
  ])
3166
3170
  return node_table
3167
3171
 
@@ -10,6 +10,7 @@ from typing import Dict, List, Optional, Set, Tuple
10
10
  from sky import check as sky_check
11
11
  from sky import sky_logging
12
12
  from sky.adaptors import common as adaptors_common
13
+ from sky.adaptors import kubernetes
13
14
  from sky.clouds import Kubernetes
14
15
  from sky.clouds.service_catalog import CloudFilter
15
16
  from sky.clouds.service_catalog import common
@@ -22,6 +23,8 @@ if typing.TYPE_CHECKING:
22
23
  else:
23
24
  pd = adaptors_common.LazyImport('pandas')
24
25
 
26
+ logger = sky_logging.init_logger(__name__)
27
+
25
28
  _PULL_FREQUENCY_HOURS = 7
26
29
 
27
30
  # We keep pull_frequency_hours so we can remotely update the default image paths
@@ -77,6 +80,11 @@ def list_accelerators_realtime(
77
80
  require_price: bool = True
78
81
  ) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
79
82
  int]]:
83
+ """List accelerators in the Kubernetes cluster.
84
+
85
+ If the user does not have sufficient permissions to list pods in all
86
+ namespaces, the function will return free GPUs as -1.
87
+ """
80
88
  # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
81
89
  # function from kubernetes_utils.
82
90
  del all_regions, require_price # Unused.
@@ -108,7 +116,17 @@ def list_accelerators_realtime(
108
116
  key = label_formatter.get_label_key()
109
117
  nodes = kubernetes_utils.get_kubernetes_nodes(context)
110
118
  # Get the pods to get the real-time GPU usage
111
- pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
119
+ try:
120
+ pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
121
+ except kubernetes.api_exception() as e:
122
+ if e.status == 403:
123
+ logger.warning('Failed to get pods in the Kubernetes cluster '
124
+ '(forbidden). Please check if your account has '
125
+ 'necessary permissions to list pods. Realtime GPU '
126
+ 'availability information may be incorrect.')
127
+ pods = None
128
+ else:
129
+ raise
112
130
  # Total number of GPUs in the cluster
113
131
  total_accelerators_capacity: Dict[str, int] = {}
114
132
  # Total number of GPUs currently available in the cluster
@@ -141,6 +159,21 @@ def list_accelerators_realtime(
141
159
  if accelerator_count not in accelerators_qtys:
142
160
  accelerators_qtys.add((accelerator_name, accelerator_count))
143
161
 
162
+ if accelerator_count >= min_quantity_filter:
163
+ quantized_count = (min_quantity_filter *
164
+ (accelerator_count // min_quantity_filter))
165
+ if accelerator_name not in total_accelerators_capacity:
166
+ total_accelerators_capacity[
167
+ accelerator_name] = quantized_count
168
+ else:
169
+ total_accelerators_capacity[
170
+ accelerator_name] += quantized_count
171
+
172
+ if pods is None:
173
+ # If we can't get the pods, we can't get the GPU usage
174
+ total_accelerators_available[accelerator_name] = -1
175
+ continue
176
+
144
177
  for pod in pods:
145
178
  # Get all the pods running on the node
146
179
  if (pod.spec.node_name == node.metadata.name and
@@ -155,16 +188,6 @@ def list_accelerators_realtime(
155
188
 
156
189
  accelerators_available = accelerator_count - allocated_qty
157
190
 
158
- if accelerator_count >= min_quantity_filter:
159
- quantized_count = (min_quantity_filter *
160
- (accelerator_count // min_quantity_filter))
161
- if accelerator_name not in total_accelerators_capacity:
162
- total_accelerators_capacity[
163
- accelerator_name] = quantized_count
164
- else:
165
- total_accelerators_capacity[
166
- accelerator_name] += quantized_count
167
-
168
191
  if accelerator_name not in total_accelerators_available:
169
192
  total_accelerators_available[accelerator_name] = 0
170
193
  if accelerators_available >= min_quantity_filter:
@@ -20,7 +20,7 @@ SETUP_ENV_VARS_CMD = (
20
20
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
21
21
  'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
22
22
  '~/container_env_var.sh && '
23
- '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
23
+ '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;'
24
24
  )
25
25
 
26
26
  # Docker daemon may not be ready when the machine is first started. The error
@@ -333,52 +333,37 @@ def _run_function_with_retries(func: Callable,
333
333
  raise
334
334
 
335
335
 
336
- def _set_env_vars_in_pods(namespace: str, context: Optional[str],
337
- new_pods: List):
338
- """Setting environment variables in pods.
339
-
340
- Once all containers are ready, we can exec into them and set env vars.
341
- Kubernetes automatically populates containers with critical
342
- environment variables, such as those for discovering services running
343
- in the cluster and CUDA/nvidia environment variables. We need to
344
- make sure these env vars are available in every task and ssh session.
345
- This is needed for GPU support and service discovery.
346
- See https://github.com/skypilot-org/skypilot/issues/2287 for
347
- more details.
348
-
349
- To do so, we capture env vars from the pod's runtime and write them to
350
- /etc/profile.d/, making them available for all users in future
351
- shell sessions.
352
- """
353
- set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
336
+ def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
337
+ """Pre-initialization step for SkyPilot pods.
354
338
 
355
- def _set_env_vars_thread(new_pod):
356
- pod_name = new_pod.metadata.name
357
- logger.info(f'{"-"*20}Start: Set up env vars in pod {pod_name!r} '
358
- f'{"-"*20}')
359
- runner = command_runner.KubernetesCommandRunner(
360
- ((namespace, context), pod_name))
339
+ This step is run in the pod right after it is created and before the
340
+ SkyPilot runtime is set up.
361
341
 
362
- def _run_env_vars_cmd():
363
- rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
364
- require_outputs=True,
365
- stream_logs=False)
366
- _raise_command_running_error('set env vars', set_k8s_env_var_cmd,
367
- pod_name, rc, stdout)
342
+ This step includes three key steps:
368
343
 
369
- _run_function_with_retries(_run_env_vars_cmd,
370
- f'set env vars in pod {pod_name}')
371
- logger.info(f'{"-"*20}End: Set up env vars in pod {pod_name!r} '
372
- f'{"-"*20}')
344
+ 1. Privilege check: Checks if the default user has sufficient privilege
345
+ to set up the kubernetes instance pod.
346
+ 2. SSH setup: Sets up SSH for the pod instance.
347
+ 3. Environment variable setup to populate k8s env vars in the pod.
373
348
 
374
- subprocess_utils.run_in_parallel(_set_env_vars_thread, new_pods,
375
- NUM_THREADS)
349
+ Make sure commands used in these methods are generic and work
350
+ on most base images. E.g., do not use Python, since that may not
351
+ be installed by default.
376
352
 
353
+ If you run any apt commands, be sure to check if the lock is available.
354
+ It is possible the `apt update` run in the pod container args may still
355
+ be running.
356
+
357
+ Args:
358
+ namespace (str): Kubernetes namespace.
359
+ context (Optional[str]): Kubernetes context.
360
+ new_nodes (List): List of new pod instances.
361
+
362
+ Raises:
363
+ config_lib.KubernetesError: If user privileges are insufficient or
364
+ setup fails.
365
+ """
377
366
 
378
- def _check_user_privilege(namespace: str, context: Optional[str],
379
- new_nodes: List) -> None:
380
- # Checks if the default user has sufficient privilege to set up
381
- # the kubernetes instance pod.
382
367
  check_k8s_user_sudo_cmd = (
383
368
  'if [ $(id -u) -eq 0 ]; then'
384
369
  # If user is root, create an alias for sudo used in skypilot setup
@@ -386,56 +371,67 @@ def _check_user_privilege(namespace: str, context: Optional[str],
386
371
  'else '
387
372
  ' if command -v sudo >/dev/null 2>&1; then '
388
373
  ' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
389
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
374
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
375
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
390
376
  ' else '
391
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
377
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
378
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
392
379
  ' fi; '
393
- 'fi')
380
+ 'fi;')
381
+
382
+ # Kubernetes automatically populates containers with critical
383
+ # environment variables, such as those for discovering services running
384
+ # in the cluster and CUDA/nvidia environment variables. We need to
385
+ # make sure these env vars are available in every task and ssh session.
386
+ # This is needed for GPU support and service discovery.
387
+ # See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
388
+ # To do so, we capture env vars from the pod's runtime and write them to
389
+ # /etc/profile.d/, making them available for all users in future
390
+ # shell sessions.
391
+ set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
394
392
 
395
- # This check needs to run on a per-image basis, so running the check on
396
- # any one pod is sufficient.
397
- new_node = new_nodes[0]
398
- pod_name = new_node.metadata.name
393
+ check_apt_update_complete_cmd = (
394
+ 'echo "Checking if apt update from container init is complete..."; '
395
+ 'timeout_secs=600; '
396
+ 'start_time=$(date +%s); '
397
+ 'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
398
+ ' echo "apt update still running. Logs:"; '
399
+ ' cat /tmp/apt-update.log; '
400
+ ' current_time=$(date +%s); '
401
+ ' elapsed=$((current_time - start_time)); '
402
+ ' if [ $elapsed -ge $timeout_secs ]; then '
403
+ ' echo "Timed out waiting for apt update"; '
404
+ ' exit 1; '
405
+ ' fi; '
406
+ ' sleep 5; '
407
+ 'done; '
408
+ 'echo "apt update complete."; ')
399
409
 
400
- runner = command_runner.KubernetesCommandRunner(
401
- ((namespace, context), pod_name))
402
- logger.info(f'{"-"*20}Start: Check user privilege in pod {pod_name!r} '
403
- f'{"-"*20}')
404
-
405
- def _run_privilege_check():
406
- rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
407
- require_outputs=True,
408
- separate_stderr=True,
409
- stream_logs=False)
410
- _raise_command_running_error('check user privilege',
411
- check_k8s_user_sudo_cmd, pod_name, rc,
412
- stdout + stderr)
413
- return stdout
414
-
415
- stdout = _run_function_with_retries(
416
- _run_privilege_check, f'check user privilege in pod {pod_name!r}')
417
-
418
- if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
419
- raise config_lib.KubernetesError(
420
- 'Insufficient system privileges detected. '
421
- 'Ensure the default user has root access or '
422
- '"sudo" is installed and the user is added to the sudoers '
423
- 'from the image.')
424
- logger.info(f'{"-"*20}End: Check user privilege in pod {pod_name!r} '
425
- f'{"-"*20}')
426
-
427
-
428
- def _setup_ssh_in_pods(namespace: str, context: Optional[str],
429
- new_nodes: List) -> None:
430
- # Setting up ssh for the pod instance. This is already setup for
431
- # the jump pod so it does not need to be run for it.
432
- set_k8s_ssh_cmd = (
433
- 'set -ex; '
410
+ install_ssh_k8s_cmd = (
434
411
  'prefix_cmd() '
435
412
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
436
413
  'export DEBIAN_FRONTEND=noninteractive;'
437
- '$(prefix_cmd) apt-get update;'
438
- '$(prefix_cmd) apt install openssh-server rsync -y; '
414
+ 'echo "Installing missing packages..."; '
415
+ 'for i in {1..5}; do '
416
+ ' output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
417
+ ' rc=$?; '
418
+ ' if [ $rc -eq 0 ]; then '
419
+ ' break; '
420
+ ' fi; '
421
+ ' echo "$output" | grep -qi "could not get lock" || '
422
+ ' grep -qi "Unable to acquire the dpkg frontend lock"; '
423
+ ' if [ $? -eq 0 ]; then '
424
+ ' echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
425
+ ' sleep 5; '
426
+ ' else '
427
+ ' echo "apt install failed for a non-lock reason: $output"; '
428
+ ' exit $rc; '
429
+ ' fi; '
430
+ 'done; '
431
+ 'if [ $rc -ne 0 ]; then '
432
+ ' echo "apt install failed after 5 attempts due to lock errors."; '
433
+ ' exit $rc; '
434
+ 'fi; '
439
435
  '$(prefix_cmd) mkdir -p /var/run/sshd; '
440
436
  '$(prefix_cmd) '
441
437
  'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
@@ -456,24 +452,35 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
456
452
  # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
457
453
  '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
458
454
 
459
- def _setup_ssh_thread(new_node):
455
+ pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
456
+ set_k8s_env_var_cmd + check_apt_update_complete_cmd +
457
+ install_ssh_k8s_cmd)
458
+
459
+ def _pre_init_thread(new_node):
460
460
  pod_name = new_node.metadata.name
461
+ logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
461
462
  runner = command_runner.KubernetesCommandRunner(
462
463
  ((namespace, context), pod_name))
463
- logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
464
464
 
465
- def _run_ssh_setup():
466
- rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
467
- require_outputs=True,
468
- stream_logs=False)
469
- _raise_command_running_error('setup ssh', set_k8s_ssh_cmd, pod_name,
470
- rc, stdout)
465
+ # Run the combined pre-init command
466
+ rc, stdout, _ = runner.run(pre_init_cmd,
467
+ require_outputs=True,
468
+ stream_logs=False)
469
+ if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
470
+ raise config_lib.KubernetesError(
471
+ 'Insufficient system privileges detected. '
472
+ 'Ensure the default user has root access or '
473
+ '"sudo" is installed and the user is added to the sudoers '
474
+ 'from the image.')
475
+
476
+ op_name = 'pre-init'
477
+ _raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
478
+ stdout)
471
479
 
472
- _run_function_with_retries(_run_ssh_setup,
473
- f'setup ssh in pod {pod_name!r}')
474
- logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
480
+ logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
475
481
 
476
- subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes, NUM_THREADS)
482
+ # Run pre_init in parallel across all new_nodes
483
+ subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, NUM_THREADS)
477
484
 
478
485
 
479
486
  def _label_pod(namespace: str, context: Optional[str], pod_name: str,
@@ -724,13 +731,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
724
731
  f'pods: {list(uninitialized_pods.keys())}')
725
732
  uninitialized_pods_list = list(uninitialized_pods.values())
726
733
 
727
- # Setup SSH and environment variables in pods.
728
- # Make sure commands used in these methods are generic and work
729
- # on most base images. E.g., do not use Python, since that may not
730
- # be installed by default.
731
- _check_user_privilege(namespace, context, uninitialized_pods_list)
732
- _setup_ssh_in_pods(namespace, context, uninitialized_pods_list)
733
- _set_env_vars_in_pods(namespace, context, uninitialized_pods_list)
734
+ # Run pre-init steps in the pod.
735
+ pre_init(namespace, context, uninitialized_pods_list)
734
736
 
735
737
  for pod in uninitialized_pods.values():
736
738
  _label_pod(namespace,
@@ -1801,13 +1801,22 @@ def get_kubernetes_node_info(
1801
1801
  number of GPUs available on the node and the number of free GPUs on the
1802
1802
  node.
1803
1803
 
1804
+ If the user does not have sufficient permissions to list pods in all
1805
+ namespaces, the function will return free GPUs as -1.
1806
+
1804
1807
  Returns:
1805
1808
  Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
1806
1809
  key and the KubernetesNodeInfo object as value
1807
1810
  """
1808
1811
  nodes = get_kubernetes_nodes(context)
1809
1812
  # Get the pods to get the real-time resource usage
1810
- pods = get_all_pods_in_kubernetes_cluster(context)
1813
+ try:
1814
+ pods = get_all_pods_in_kubernetes_cluster(context)
1815
+ except kubernetes.api_exception() as e:
1816
+ if e.status == 403:
1817
+ pods = None
1818
+ else:
1819
+ raise
1811
1820
 
1812
1821
  label_formatter, _ = detect_gpu_label_formatter(context)
1813
1822
  if not label_formatter:
@@ -1828,19 +1837,22 @@ def get_kubernetes_node_info(
1828
1837
  accelerator_count = int(node.status.allocatable.get(
1829
1838
  'nvidia.com/gpu', 0))
1830
1839
 
1831
- for pod in pods:
1832
- # Get all the pods running on the node
1833
- if (pod.spec.node_name == node.metadata.name and
1834
- pod.status.phase in ['Running', 'Pending']):
1835
- # Iterate over all the containers in the pod and sum the
1836
- # GPU requests
1837
- for container in pod.spec.containers:
1838
- if container.resources.requests:
1839
- allocated_qty += int(
1840
- container.resources.requests.get(
1841
- 'nvidia.com/gpu', 0))
1842
-
1843
- accelerators_available = accelerator_count - allocated_qty
1840
+ if pods is None:
1841
+ accelerators_available = -1
1842
+
1843
+ else:
1844
+ for pod in pods:
1845
+ # Get all the pods running on the node
1846
+ if (pod.spec.node_name == node.metadata.name and
1847
+ pod.status.phase in ['Running', 'Pending']):
1848
+ # Iterate over all the containers in the pod and sum the
1849
+ # GPU requests
1850
+ for container in pod.spec.containers:
1851
+ if container.resources.requests:
1852
+ allocated_qty += int(
1853
+ container.resources.requests.get(
1854
+ 'nvidia.com/gpu', 0))
1855
+ accelerators_available = accelerator_count - allocated_qty
1844
1856
 
1845
1857
  node_info_dict[node.metadata.name] = KubernetesNodeInfo(
1846
1858
  name=node.metadata.name,
@@ -11,6 +11,7 @@ from sky.serve.core import tail_logs
11
11
  from sky.serve.core import terminate_replica
12
12
  from sky.serve.core import up
13
13
  from sky.serve.core import update
14
+ from sky.serve.load_balancing_policies import LB_POLICIES
14
15
  from sky.serve.serve_state import ReplicaStatus
15
16
  from sky.serve.serve_state import ServiceStatus
16
17
  from sky.serve.serve_utils import DEFAULT_UPDATE_MODE
@@ -35,6 +36,7 @@ __all__ = [
35
36
  'get_endpoint',
36
37
  'INITIAL_VERSION',
37
38
  'LB_CONTROLLER_SYNC_INTERVAL_SECONDS',
39
+ 'LB_POLICIES',
38
40
  'ReplicaStatus',
39
41
  'ServiceComponent',
40
42
  'ServiceStatus',
@@ -2,7 +2,7 @@
2
2
  import asyncio
3
3
  import logging
4
4
  import threading
5
- from typing import Dict, Union
5
+ from typing import Dict, Optional, Union
6
6
 
7
7
  import aiohttp
8
8
  import fastapi
@@ -27,18 +27,24 @@ class SkyServeLoadBalancer:
27
27
  policy.
28
28
  """
29
29
 
30
- def __init__(self, controller_url: str, load_balancer_port: int) -> None:
30
+ def __init__(self,
31
+ controller_url: str,
32
+ load_balancer_port: int,
33
+ load_balancing_policy_name: Optional[str] = None) -> None:
31
34
  """Initialize the load balancer.
32
35
 
33
36
  Args:
34
37
  controller_url: The URL of the controller.
35
38
  load_balancer_port: The port where the load balancer listens to.
39
+ load_balancing_policy_name: The name of the load balancing policy
40
+ to use. Defaults to None.
36
41
  """
37
42
  self._app = fastapi.FastAPI()
38
43
  self._controller_url: str = controller_url
39
44
  self._load_balancer_port: int = load_balancer_port
40
- self._load_balancing_policy: lb_policies.LoadBalancingPolicy = (
41
- lb_policies.RoundRobinPolicy())
45
+ # Use the registry to create the load balancing policy
46
+ self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make(
47
+ load_balancing_policy_name)
42
48
  self._request_aggregator: serve_utils.RequestsAggregator = (
43
49
  serve_utils.RequestTimestamp())
44
50
  # TODO(tian): httpx.Client has a resource limit of 100 max connections
@@ -223,9 +229,21 @@ class SkyServeLoadBalancer:
223
229
  uvicorn.run(self._app, host='0.0.0.0', port=self._load_balancer_port)
224
230
 
225
231
 
226
- def run_load_balancer(controller_addr: str, load_balancer_port: int):
227
- load_balancer = SkyServeLoadBalancer(controller_url=controller_addr,
228
- load_balancer_port=load_balancer_port)
232
+ def run_load_balancer(controller_addr: str,
233
+ load_balancer_port: int,
234
+ load_balancing_policy_name: Optional[str] = None) -> None:
235
+ """Run the load balancer.
236
+
237
+ Args:
238
+ controller_addr: The address of the controller.
239
+ load_balancer_port: The port where the load balancer listens to.
240
+ load_balancing_policy_name: The name of the load balancing policy
241
+ to use. Defaults to None.
242
+ """
243
+ load_balancer = SkyServeLoadBalancer(
244
+ controller_url=controller_addr,
245
+ load_balancer_port=load_balancer_port,
246
+ load_balancing_policy_name=load_balancing_policy_name)
229
247
  load_balancer.run()
230
248
 
231
249
 
@@ -241,5 +259,13 @@ if __name__ == '__main__':
241
259
  required=True,
242
260
  default=8890,
243
261
  help='The port where the load balancer listens to.')
262
+ available_policies = list(lb_policies.LB_POLICIES.keys())
263
+ parser.add_argument(
264
+ '--load-balancing-policy',
265
+ choices=available_policies,
266
+ default='round_robin',
267
+ help=f'The load balancing policy to use. Available policies: '
268
+ f'{", ".join(available_policies)}.')
244
269
  args = parser.parse_args()
245
- run_load_balancer(args.controller_addr, args.load_balancer_port)
270
+ run_load_balancer(args.controller_addr, args.load_balancer_port,
271
+ args.load_balancing_policy)
@@ -10,6 +10,10 @@ if typing.TYPE_CHECKING:
10
10
 
11
11
  logger = sky_logging.init_logger(__name__)
12
12
 
13
+ # Define a registry for load balancing policies
14
+ LB_POLICIES = {}
15
+ DEFAULT_LB_POLICY = None
16
+
13
17
 
14
18
  def _request_repr(request: 'fastapi.Request') -> str:
15
19
  return ('<Request '
@@ -25,6 +29,24 @@ class LoadBalancingPolicy:
25
29
  def __init__(self) -> None:
26
30
  self.ready_replicas: List[str] = []
27
31
 
32
+ def __init_subclass__(cls, name: str, default: bool = False):
33
+ LB_POLICIES[name] = cls
34
+ if default:
35
+ global DEFAULT_LB_POLICY
36
+ assert DEFAULT_LB_POLICY is None, (
37
+ 'Only one policy can be default.')
38
+ DEFAULT_LB_POLICY = name
39
+
40
+ @classmethod
41
+ def make(cls, policy_name: Optional[str] = None) -> 'LoadBalancingPolicy':
42
+ """Create a load balancing policy from a name."""
43
+ if policy_name is None:
44
+ policy_name = DEFAULT_LB_POLICY
45
+
46
+ if policy_name not in LB_POLICIES:
47
+ raise ValueError(f'Unknown load balancing policy: {policy_name}')
48
+ return LB_POLICIES[policy_name]()
49
+
28
50
  def set_ready_replicas(self, ready_replicas: List[str]) -> None:
29
51
  raise NotImplementedError
30
52
 
@@ -44,7 +66,7 @@ class LoadBalancingPolicy:
44
66
  raise NotImplementedError
45
67
 
46
68
 
47
- class RoundRobinPolicy(LoadBalancingPolicy):
69
+ class RoundRobinPolicy(LoadBalancingPolicy, name='round_robin', default=True):
48
70
  """Round-robin load balancing policy."""
49
71
 
50
72
  def __init__(self) -> None: