skypilot-nightly 1.0.0.dev20241029__tar.gz → 1.0.0.dev20241031__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. {skypilot_nightly-1.0.0.dev20241029/skypilot_nightly.egg-info → skypilot_nightly-1.0.0.dev20241031}/PKG-INFO +13 -11
  2. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/setup.py +4 -3
  3. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/__init__.py +2 -2
  4. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/backends/cloud_vm_ray_backend.py +14 -13
  5. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/azure.py +4 -5
  6. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +51 -3
  7. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/utils/gcp_utils.py +0 -8
  8. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/execution.py +5 -4
  9. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/controller.py +38 -22
  10. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/recovery_strategy.py +30 -5
  11. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/state.py +33 -5
  12. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/utils.py +28 -4
  13. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/azure/instance.py +4 -24
  14. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/resources.py +28 -8
  15. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/setup_files/setup.py +4 -3
  16. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/job_lib.py +34 -42
  17. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/azure-ray.yml.j2 +0 -1
  18. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/dag_utils.py +14 -4
  19. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/schemas.py +21 -1
  20. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031/skypilot_nightly.egg-info}/PKG-INFO +13 -11
  21. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/skypilot_nightly.egg-info/requires.txt +12 -10
  22. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_smoke.py +109 -116
  23. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/LICENSE +0 -0
  24. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/MANIFEST.in +0 -0
  25. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/README.md +0 -0
  26. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/pyproject.toml +0 -0
  27. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/setup.cfg +0 -0
  28. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/__init__.py +0 -0
  29. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/aws.py +0 -0
  30. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/azure.py +0 -0
  31. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/cloudflare.py +0 -0
  32. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/common.py +0 -0
  33. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/cudo.py +0 -0
  34. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/docker.py +0 -0
  35. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/gcp.py +0 -0
  36. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/ibm.py +0 -0
  37. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/kubernetes.py +0 -0
  38. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/oci.py +0 -0
  39. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/runpod.py +0 -0
  40. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/adaptors/vsphere.py +0 -0
  41. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/admin_policy.py +0 -0
  42. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/authentication.py +0 -0
  43. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/backends/__init__.py +0 -0
  44. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/backends/backend.py +0 -0
  45. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/backends/backend_utils.py +0 -0
  46. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/backends/docker_utils.py +0 -0
  47. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/backends/local_docker_backend.py +0 -0
  48. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/backends/monkey_patches/monkey_patch_ray_up.py +0 -0
  49. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/backends/wheel_utils.py +0 -0
  50. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/benchmark/__init__.py +0 -0
  51. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/benchmark/benchmark_state.py +0 -0
  52. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/benchmark/benchmark_utils.py +0 -0
  53. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/check.py +0 -0
  54. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/cli.py +0 -0
  55. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/cloud_stores.py +0 -0
  56. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/__init__.py +0 -0
  57. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/aws.py +0 -0
  58. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/cloud.py +0 -0
  59. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/cloud_registry.py +0 -0
  60. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/cudo.py +0 -0
  61. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/fluidstack.py +0 -0
  62. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/gcp.py +0 -0
  63. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/ibm.py +0 -0
  64. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/kubernetes.py +0 -0
  65. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/lambda_cloud.py +0 -0
  66. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/oci.py +0 -0
  67. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/paperspace.py +0 -0
  68. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/runpod.py +0 -0
  69. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/scp.py +0 -0
  70. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/__init__.py +0 -0
  71. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/aws_catalog.py +0 -0
  72. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/azure_catalog.py +0 -0
  73. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/common.py +0 -0
  74. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/config.py +0 -0
  75. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/constants.py +0 -0
  76. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/cudo_catalog.py +0 -0
  77. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/data_fetchers/__init__.py +0 -0
  78. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +0 -0
  79. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +0 -0
  80. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +0 -0
  81. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +0 -0
  82. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +0 -0
  83. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +0 -0
  84. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/fluidstack_catalog.py +0 -0
  85. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/gcp_catalog.py +0 -0
  86. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/ibm_catalog.py +0 -0
  87. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/kubernetes_catalog.py +0 -0
  88. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/lambda_catalog.py +0 -0
  89. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/oci_catalog.py +0 -0
  90. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/paperspace_catalog.py +0 -0
  91. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/runpod_catalog.py +0 -0
  92. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/scp_catalog.py +0 -0
  93. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/service_catalog/vsphere_catalog.py +0 -0
  94. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/utils/__init__.py +0 -0
  95. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/utils/aws_utils.py +0 -0
  96. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/utils/azure_utils.py +0 -0
  97. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/utils/oci_utils.py +0 -0
  98. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/utils/scp_utils.py +0 -0
  99. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/clouds/vsphere.py +0 -0
  100. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/core.py +0 -0
  101. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/dag.py +0 -0
  102. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/data/__init__.py +0 -0
  103. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/data/data_transfer.py +0 -0
  104. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/data/data_utils.py +0 -0
  105. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/data/mounting_utils.py +0 -0
  106. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/data/storage.py +0 -0
  107. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/data/storage_utils.py +0 -0
  108. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/exceptions.py +0 -0
  109. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/global_user_state.py +0 -0
  110. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/__init__.py +0 -0
  111. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/constants.py +0 -0
  112. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/core.py +0 -0
  113. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/dashboard/dashboard.py +0 -0
  114. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/dashboard/static/favicon.ico +0 -0
  115. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/jobs/dashboard/templates/index.html +0 -0
  116. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/optimizer.py +0 -0
  117. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/__init__.py +0 -0
  118. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/aws/__init__.py +0 -0
  119. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/aws/config.py +0 -0
  120. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/aws/instance.py +0 -0
  121. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/aws/utils.py +0 -0
  122. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/azure/__init__.py +0 -0
  123. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/azure/azure-config-template.json +0 -0
  124. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/azure/config.py +0 -0
  125. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/common.py +0 -0
  126. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/constants.py +0 -0
  127. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/cudo/__init__.py +0 -0
  128. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/cudo/config.py +0 -0
  129. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/cudo/cudo_machine_type.py +0 -0
  130. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/cudo/cudo_utils.py +0 -0
  131. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/cudo/cudo_wrapper.py +0 -0
  132. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/cudo/instance.py +0 -0
  133. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/docker_utils.py +0 -0
  134. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/fluidstack/__init__.py +0 -0
  135. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/fluidstack/config.py +0 -0
  136. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/fluidstack/fluidstack_utils.py +0 -0
  137. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/fluidstack/instance.py +0 -0
  138. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/gcp/__init__.py +0 -0
  139. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/gcp/config.py +0 -0
  140. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/gcp/constants.py +0 -0
  141. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/gcp/instance.py +0 -0
  142. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/gcp/instance_utils.py +0 -0
  143. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/gcp/mig_utils.py +0 -0
  144. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/instance_setup.py +0 -0
  145. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/kubernetes/__init__.py +0 -0
  146. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/kubernetes/config.py +0 -0
  147. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/kubernetes/instance.py +0 -0
  148. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -0
  149. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -0
  150. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/kubernetes/network.py +0 -0
  151. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/kubernetes/network_utils.py +0 -0
  152. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/kubernetes/utils.py +0 -0
  153. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/lambda_cloud/__init__.py +0 -0
  154. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/lambda_cloud/config.py +0 -0
  155. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/lambda_cloud/instance.py +0 -0
  156. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/lambda_cloud/lambda_utils.py +0 -0
  157. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/logging.py +0 -0
  158. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/metadata_utils.py +0 -0
  159. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/paperspace/__init__.py +0 -0
  160. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/paperspace/config.py +0 -0
  161. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/paperspace/constants.py +0 -0
  162. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/paperspace/instance.py +0 -0
  163. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/paperspace/utils.py +0 -0
  164. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/provisioner.py +0 -0
  165. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/runpod/__init__.py +0 -0
  166. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/runpod/config.py +0 -0
  167. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/runpod/instance.py +0 -0
  168. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/runpod/utils.py +0 -0
  169. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/__init__.py +0 -0
  170. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/__init__.py +0 -0
  171. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/cls_api_client.py +0 -0
  172. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/cls_api_helper.py +0 -0
  173. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/custom_script.py +0 -0
  174. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/id_generator.py +0 -0
  175. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/metadata_utils.py +0 -0
  176. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/service_manager.py +0 -0
  177. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/service_manager_factory.py +0 -0
  178. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/ssl_helper.py +0 -0
  179. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/vapiconnect.py +0 -0
  180. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/common/vim_utils.py +0 -0
  181. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/config.py +0 -0
  182. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/instance.py +0 -0
  183. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/provision/vsphere/vsphere_utils.py +0 -0
  184. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/__init__.py +0 -0
  185. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/autoscalers.py +0 -0
  186. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/constants.py +0 -0
  187. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/controller.py +0 -0
  188. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/core.py +0 -0
  189. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/load_balancer.py +0 -0
  190. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/load_balancing_policies.py +0 -0
  191. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/replica_managers.py +0 -0
  192. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/serve_state.py +0 -0
  193. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/serve_utils.py +0 -0
  194. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/service.py +0 -0
  195. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/serve/service_spec.py +0 -0
  196. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/setup_files/MANIFEST.in +0 -0
  197. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/sky_logging.py +0 -0
  198. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/LICENSE +0 -0
  199. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/__init__.py +0 -0
  200. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/attempt_skylet.py +0 -0
  201. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/autostop_lib.py +0 -0
  202. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/configs.py +0 -0
  203. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/constants.py +0 -0
  204. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/events.py +0 -0
  205. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/log_lib.py +0 -0
  206. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/log_lib.pyi +0 -0
  207. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/__init__.py +0 -0
  208. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/command_runner.py +0 -0
  209. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/ibm/__init__.py +0 -0
  210. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/ibm/node_provider.py +0 -0
  211. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/ibm/utils.py +0 -0
  212. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/ibm/vpc_provider.py +0 -0
  213. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/oci/__init__.py +0 -0
  214. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/oci/node_provider.py +0 -0
  215. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/oci/query_helper.py +0 -0
  216. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/oci/utils.py +0 -0
  217. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/scp/__init__.py +0 -0
  218. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/scp/config.py +0 -0
  219. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/providers/scp/node_provider.py +0 -0
  220. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/ray_patches/__init__.py +0 -0
  221. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/ray_patches/autoscaler.py.patch +0 -0
  222. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/ray_patches/cli.py.patch +0 -0
  223. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/ray_patches/command_runner.py.patch +0 -0
  224. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/ray_patches/log_monitor.py.patch +0 -0
  225. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/ray_patches/resource_demand_scheduler.py.patch +0 -0
  226. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/ray_patches/updater.py.patch +0 -0
  227. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/ray_patches/worker.py.patch +0 -0
  228. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/skylet.py +0 -0
  229. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skylet/subprocess_daemon.py +0 -0
  230. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/skypilot_config.py +0 -0
  231. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/status_lib.py +0 -0
  232. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/task.py +0 -0
  233. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/aws-ray.yml.j2 +0 -0
  234. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/cudo-ray.yml.j2 +0 -0
  235. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/fluidstack-ray.yml.j2 +0 -0
  236. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/gcp-ray.yml.j2 +0 -0
  237. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/ibm-ray.yml.j2 +0 -0
  238. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/jobs-controller.yaml.j2 +0 -0
  239. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/kubernetes-ingress.yml.j2 +0 -0
  240. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/kubernetes-loadbalancer.yml.j2 +0 -0
  241. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/kubernetes-port-forward-proxy-command.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/kubernetes-ray.yml.j2 +0 -0
  243. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/kubernetes-ssh-jump.yml.j2 +0 -0
  244. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/lambda-ray.yml.j2 +0 -0
  245. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/local-ray.yml.j2 +0 -0
  246. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/oci-ray.yml.j2 +0 -0
  247. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/paperspace-ray.yml.j2 +0 -0
  248. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/runpod-ray.yml.j2 +0 -0
  249. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/scp-ray.yml.j2 +0 -0
  250. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/sky-serve-controller.yaml.j2 +0 -0
  251. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/templates/vsphere-ray.yml.j2 +0 -0
  252. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/usage/__init__.py +0 -0
  253. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/usage/constants.py +0 -0
  254. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/usage/usage_lib.py +0 -0
  255. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/__init__.py +0 -0
  256. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/accelerator_registry.py +0 -0
  257. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/admin_policy_utils.py +0 -0
  258. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/cli_utils/__init__.py +0 -0
  259. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/cli_utils/status_utils.py +0 -0
  260. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/cluster_yaml_utils.py +0 -0
  261. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/command_runner.py +0 -0
  262. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/command_runner.pyi +0 -0
  263. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/common_utils.py +0 -0
  264. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/controller_utils.py +0 -0
  265. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/db_utils.py +0 -0
  266. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/env_options.py +0 -0
  267. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/__init__.py +0 -0
  268. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/create_cluster.sh +0 -0
  269. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/delete_cluster.sh +0 -0
  270. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/deploy_remote_cluster.sh +0 -0
  271. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/generate_kind_config.py +0 -0
  272. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/generate_kubeconfig.sh +0 -0
  273. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/gpu_labeler.py +0 -0
  274. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +0 -0
  275. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +0 -0
  276. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/rsync_helper.sh +0 -0
  277. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -0
  278. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/kubernetes_enums.py +0 -0
  279. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/log_utils.py +0 -0
  280. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/resources_utils.py +0 -0
  281. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/rich_utils.py +0 -0
  282. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/subprocess_utils.py +0 -0
  283. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/timeline.py +0 -0
  284. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/ux_utils.py +0 -0
  285. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/sky/utils/validator.py +0 -0
  286. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/skypilot_nightly.egg-info/SOURCES.txt +0 -0
  287. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/skypilot_nightly.egg-info/dependency_links.txt +0 -0
  288. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/skypilot_nightly.egg-info/entry_points.txt +0 -0
  289. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/skypilot_nightly.egg-info/top_level.txt +0 -0
  290. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_api.py +0 -0
  291. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_cli.py +0 -0
  292. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_config.py +0 -0
  293. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_global_user_state.py +0 -0
  294. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_jobs.py +0 -0
  295. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_jobs_and_serve.py +0 -0
  296. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_list_accelerators.py +0 -0
  297. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_optimizer_dryruns.py +0 -0
  298. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_optimizer_random_dag.py +0 -0
  299. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_serve_autoscaler.py +0 -0
  300. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_storage.py +0 -0
  301. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_wheels.py +0 -0
  302. {skypilot_nightly-1.0.0.dev20241029 → skypilot_nightly-1.0.0.dev20241031}/tests/test_yaml_parser.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241029
3
+ Version: 1.0.0.dev20241031
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -47,11 +47,12 @@ Requires-Dist: botocore>=1.29.10; extra == "aws"
47
47
  Requires-Dist: boto3>=1.26.1; extra == "aws"
48
48
  Requires-Dist: colorama<0.4.5; extra == "aws"
49
49
  Provides-Extra: azure
50
- Requires-Dist: azure-cli>=2.31.0; extra == "azure"
51
- Requires-Dist: azure-core; extra == "azure"
52
- Requires-Dist: azure-identity>=1.13.0; extra == "azure"
53
- Requires-Dist: azure-mgmt-network; extra == "azure"
54
- Requires-Dist: azure-storage-blob; extra == "azure"
50
+ Requires-Dist: azure-cli>=2.65.0; extra == "azure"
51
+ Requires-Dist: azure-core>=1.31.0; extra == "azure"
52
+ Requires-Dist: azure-identity>=1.19.0; extra == "azure"
53
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "azure"
54
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "azure"
55
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "azure"
55
56
  Requires-Dist: msgraph-sdk; extra == "azure"
56
57
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "azure"
57
58
  Provides-Extra: gcp
@@ -102,11 +103,12 @@ Requires-Dist: awscli>=1.27.10; extra == "all"
102
103
  Requires-Dist: botocore>=1.29.10; extra == "all"
103
104
  Requires-Dist: boto3>=1.26.1; extra == "all"
104
105
  Requires-Dist: colorama<0.4.5; extra == "all"
105
- Requires-Dist: azure-cli>=2.31.0; extra == "all"
106
- Requires-Dist: azure-core; extra == "all"
107
- Requires-Dist: azure-identity>=1.13.0; extra == "all"
108
- Requires-Dist: azure-mgmt-network; extra == "all"
109
- Requires-Dist: azure-storage-blob; extra == "all"
106
+ Requires-Dist: azure-cli>=2.65.0; extra == "all"
107
+ Requires-Dist: azure-core>=1.31.0; extra == "all"
108
+ Requires-Dist: azure-identity>=1.19.0; extra == "all"
109
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
110
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
111
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
110
112
  Requires-Dist: msgraph-sdk; extra == "all"
111
113
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
112
114
  Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
@@ -153,7 +153,7 @@ install_requires = [
153
153
  'tabulate',
154
154
  # Light weight requirement, can be replaced with "typing" once
155
155
  # we deprecate Python 3.7 (this will take a while).
156
- "typing_extensions",
156
+ 'typing_extensions',
157
157
  'filelock >= 3.6.0',
158
158
  'packaging',
159
159
  'psutil',
@@ -216,8 +216,9 @@ extras_require: Dict[str, List[str]] = {
216
216
  # We need azure-identity>=1.13.0 to enable the customization of the
217
217
  # timeout of AzureCliCredential.
218
218
  'azure': [
219
- 'azure-cli>=2.31.0', 'azure-core', 'azure-identity>=1.13.0',
220
- 'azure-mgmt-network', 'azure-storage-blob', 'msgraph-sdk'
219
+ 'azure-cli>=2.65.0', 'azure-core>=1.31.0', 'azure-identity>=1.19.0',
220
+ 'azure-mgmt-network>=27.0.0', 'azure-mgmt-compute>=33.0.0',
221
+ 'azure-storage-blob>=12.23.1', 'msgraph-sdk'
221
222
  ] + local_ray,
222
223
  # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
223
224
  # parameter for stopping instances.
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '47ebae73e972c65de6e87aa7556220e515f2fc5e'
8
+ _SKYPILOT_COMMIT_SHA = 'c4eeeb5fb3ef64be0f05a727e119ac9266f8940f'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241029'
38
+ __version__ = '1.0.0.dev20241031'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -1950,17 +1950,8 @@ class RetryingVmProvisioner(object):
1950
1950
 
1951
1951
  failover_history: List[Exception] = list()
1952
1952
 
1953
- style = colorama.Style
1954
- fore = colorama.Fore
1955
1953
  # Retrying launchable resources.
1956
1954
  while True:
1957
- if (isinstance(to_provision.cloud, clouds.Azure) and
1958
- to_provision.accelerators is not None and
1959
- 'A10' in to_provision.accelerators and prev_handle is None):
1960
- logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch '
1961
- 'an A10 cluster on Azure. This may take ~20 '
1962
- 'minutes due to driver installation.'
1963
- f'{style.RESET_ALL}')
1964
1955
  try:
1965
1956
  # Recheck cluster name as the 'except:' block below may
1966
1957
  # change the cloud assignment.
@@ -2476,7 +2467,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2476
2467
  """Returns number of IPs per node in the cluster, handling TPU Pod."""
2477
2468
  is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources)
2478
2469
  if is_tpu_vm_pod:
2479
- num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources)
2470
+ num_ips = len(self.internal_ips())
2480
2471
  else:
2481
2472
  num_ips = 1
2482
2473
  return num_ips
@@ -3175,9 +3166,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3175
3166
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3176
3167
  if returncode == 255:
3177
3168
  is_message_too_long = False
3178
- with open(setup_log_path, 'r', encoding='utf-8') as f:
3179
- if 'too long' in f.read():
3180
- is_message_too_long = True
3169
+ try:
3170
+ with open(os.path.expanduser(setup_log_path),
3171
+ 'r',
3172
+ encoding='utf-8') as f:
3173
+ if 'too long' in f.read():
3174
+ is_message_too_long = True
3175
+ except Exception as e: # pylint: disable=broad-except
3176
+ # We don't crash the setup if we cannot read the log file.
3177
+ # Instead, we should retry the setup by dumping the script
3178
+ # to a file to be safe.
3179
+ logger.debug('Failed to read setup log file '
3180
+ f'{setup_log_path}: {e}')
3181
+ is_message_too_long = True
3181
3182
 
3182
3183
  if is_message_too_long:
3183
3184
  # If the setup script is too long, we retry it with dumping
@@ -44,6 +44,8 @@ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2'
44
44
  _DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
45
45
  _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
46
46
  _FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
47
+ # This is used by Azure GPU VMs that use grid drivers (e.g. A10).
48
+ _DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'
47
49
 
48
50
  _COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'
49
51
 
@@ -220,6 +222,8 @@ class Azure(clouds.Cloud):
220
222
  acc_name = list(acc.keys())[0]
221
223
  if acc_name == 'K80':
222
224
  return _DEFAULT_GPU_K80_IMAGE_ID
225
+ if acc_name == 'A10':
226
+ return _DEFAULT_GPU_GRID_IMAGE_ID
223
227
  # About Gen V1 vs V2:
224
228
  # In Azure, all instances with K80 (Standard_NC series), some
225
229
  # instances with M60 (Standard_NV series) and some cpu instances
@@ -350,10 +354,6 @@ class Azure(clouds.Cloud):
350
354
  'image_version': version,
351
355
  }
352
356
 
353
- # Setup the A10 nvidia driver.
354
- need_nvidia_driver_extension = (acc_dict is not None and
355
- 'A10' in acc_dict)
356
-
357
357
  # Determine resource group for deploying the instance.
358
358
  resource_group_name = skypilot_config.get_nested(
359
359
  ('azure', 'resource_group_vm'), None)
@@ -413,7 +413,6 @@ class Azure(clouds.Cloud):
413
413
  # Azure does not support specific zones.
414
414
  'zones': None,
415
415
  **image_config,
416
- 'need_nvidia_driver_extension': need_nvidia_driver_extension,
417
416
  'disk_tier': Azure._get_disk_type(disk_tier),
418
417
  'cloud_init_setup_commands': cloud_init_setup_commands,
419
418
  'azure_subscription_id': self.get_project_id(dryrun),
@@ -47,6 +47,10 @@ TPU_RETRY_CNT = 3
47
47
  TPU_V4_ZONES = ['us-central2-b']
48
48
  # TPU v3 pods are available in us-east1-d, but hidden in the skus.
49
49
  # We assume the TPU prices are the same as us-central1.
50
+ # TPU v6e's pricing info is not available in the SKUs. However,
51
+ # https://cloud.google.com/tpu/pricing lists the prices for 4 regions:
52
+ # us-east1, us-east5, europe-west4, and asia-northeast1. We hardcode them here
53
+ # and filter out the other regions (us-central{1,2}, us-south1).
50
54
  HIDDEN_TPU_DF = pd.read_csv(
51
55
  io.StringIO(
52
56
  textwrap.dedent("""\
@@ -58,8 +62,50 @@ HIDDEN_TPU_DF = pd.read_csv(
58
62
  ,tpu-v3-512,1,,,tpu-v3-512,512.0,153.6,us-east1,us-east1-d
59
63
  ,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
60
64
  ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
65
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-b
66
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-c
67
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.97,,europe-west4,europe-west4-a
68
+ ,tpu-v6e-1,1,,,tpu-v6e-1,3.24,,asia-northeast1,asia-northeast1-b
69
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east1,us-east1-d
70
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-b
71
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-c
72
+ ,tpu-v6e-4,1,,,tpu-v6e-4,11.88,,europe-west4,europe-west4-a
73
+ ,tpu-v6e-4,1,,,tpu-v6e-4,12.96,,asia-northeast1,asia-northeast1-b
74
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east1,us-east1-d
75
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-b
76
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-c
77
+ ,tpu-v6e-8,1,,,tpu-v6e-8,23.76,,europe-west4,europe-west4-a
78
+ ,tpu-v6e-8,1,,,tpu-v6e-8,25.92,,asia-northeast1,asia-northeast1-b
79
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east1,us-east1-d
80
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-b
81
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-c
82
+ ,tpu-v6e-16,1,,,tpu-v6e-16,47.52,,europe-west4,europe-west4-a
83
+ ,tpu-v6e-16,1,,,tpu-v6e-16,51.84,,asia-northeast1,asia-northeast1-b
84
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east1,us-east1-d
85
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-b
86
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-c
87
+ ,tpu-v6e-32,1,,,tpu-v6e-32,95.04,,europe-west4,europe-west4-a
88
+ ,tpu-v6e-32,1,,,tpu-v6e-32,103.68,,asia-northeast1,asia-northeast1-b
89
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east1,us-east1-d
90
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-b
91
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-c
92
+ ,tpu-v6e-64,1,,,tpu-v6e-64,190.08,,europe-west4,europe-west4-a
93
+ ,tpu-v6e-64,1,,,tpu-v6e-64,207.36,,asia-northeast1,asia-northeast1-b
94
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east1,us-east1-d
95
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-b
96
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-c
97
+ ,tpu-v6e-128,1,,,tpu-v6e-128,380.16,,europe-west4,europe-west4-a
98
+ ,tpu-v6e-128,1,,,tpu-v6e-128,414.72,,asia-northeast1,asia-northeast1-b
99
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east1,us-east1-d
100
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-b
101
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-c
102
+ ,tpu-v6e-256,1,,,tpu-v6e-256,760.32,,europe-west4,europe-west4-a
103
+ ,tpu-v6e-256,1,,,tpu-v6e-256,829.44,,asia-northeast1,asia-northeast1-b
104
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east1,us-east1-d
61
105
  """)))
62
106
 
107
+ TPU_V6E_MISSING_REGIONS = ['us-central1', 'us-central2', 'us-south1']
108
+
63
109
  # TPU V5 is not visible in specific zones. We hardcode the missing zones here.
64
110
  # NOTE(dev): Keep the zones and the df in sync.
65
111
  TPU_V5_MISSING_ZONES_DF = {
@@ -683,11 +729,13 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
683
729
  'not found in SKUs or hidden TPU price DF.')
684
730
  # TODO(tian): Hack. Should investigate how to retrieve the price
685
731
  # for TPU-v6e.
686
- if not tpu_name.startswith('tpu-v6e'):
732
+ if (tpu_name.startswith('tpu-v6e') and
733
+ tpu_region in TPU_V6E_MISSING_REGIONS):
734
+ if not spot:
735
+ tpu_price = 0.0
736
+ else:
687
737
  assert spot or tpu_price is not None, (row, hidden_tpu,
688
738
  HIDDEN_TPU_DF)
689
- else:
690
- tpu_price = 0.0
691
739
  return tpu_price
692
740
 
693
741
  df['Price'] = df.apply(lambda row: get_tpu_price(row, spot=False), axis=1)
@@ -49,14 +49,6 @@ def is_tpu_vm_pod(resources: Optional['resources_lib.Resources']) -> bool:
49
49
  return not acc.endswith('-8')
50
50
 
51
51
 
52
- def get_num_tpu_devices(resources: Optional['resources_lib.Resources']) -> int:
53
- if resources is None or not is_tpu(resources):
54
- raise ValueError('resources must be a valid TPU resource.')
55
- acc, _ = list(resources.accelerators.items())[0]
56
- num_tpu_devices = int(int(acc.split('-')[2]) / 8)
57
- return num_tpu_devices
58
-
59
-
60
52
  @dataclasses.dataclass
61
53
  class SpecificReservation:
62
54
  count: int
@@ -171,10 +171,11 @@ def _execute(
171
171
  task = dag.tasks[0]
172
172
 
173
173
  if any(r.job_recovery is not None for r in task.resources):
174
- with ux_utils.print_exception_no_traceback():
175
- raise ValueError(
176
- 'Job recovery is specified in the task. To launch a '
177
- 'managed job, please use: sky jobs launch')
174
+ logger.warning(
175
+ f'{colorama.Style.DIM}The task has `job_recovery` specified, '
176
+ 'but is launched as an unmanaged job. It will be ignored.'
177
+ 'To enable job recovery, use managed jobs: sky jobs launch.'
178
+ f'{colorama.Style.RESET_ALL}')
178
179
 
179
180
  cluster_exists = False
180
181
  if cluster_name is not None:
@@ -160,6 +160,11 @@ class JobsController:
160
160
  if task_id == 0:
161
161
  submitted_at = backend_utils.get_timestamp_from_run_timestamp(
162
162
  self._backend.run_timestamp)
163
+ assert task.name is not None, task
164
+ cluster_name = managed_job_utils.generate_managed_job_cluster_name(
165
+ task.name, self._job_id)
166
+ self._strategy_executor = recovery_strategy.StrategyExecutor.make(
167
+ cluster_name, self._backend, task, self._retry_until_up)
163
168
  managed_job_state.set_submitted(
164
169
  self._job_id,
165
170
  task_id,
@@ -167,15 +172,14 @@ class JobsController:
167
172
  submitted_at,
168
173
  resources_str=backend_utils.get_task_resources_str(
169
174
  task, is_managed_job=True),
175
+ specs={
176
+ 'max_restarts_on_errors':
177
+ self._strategy_executor.max_restarts_on_errors
178
+ },
170
179
  callback_func=callback_func)
171
180
  logger.info(
172
181
  f'Submitted managed job {self._job_id} (task: {task_id}, name: '
173
182
  f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
174
- assert task.name is not None, task
175
- cluster_name = managed_job_utils.generate_managed_job_cluster_name(
176
- task.name, self._job_id)
177
- self._strategy_executor = recovery_strategy.StrategyExecutor.make(
178
- cluster_name, self._backend, task, self._retry_until_up)
179
183
 
180
184
  logger.info('Started monitoring.')
181
185
  managed_job_state.set_starting(job_id=self._job_id,
@@ -283,23 +287,35 @@ class JobsController:
283
287
  failure_reason = (
284
288
  'To see the details, run: '
285
289
  f'sky jobs logs --controller {self._job_id}')
286
-
287
- managed_job_state.set_failed(
288
- self._job_id,
289
- task_id,
290
- failure_type=managed_job_status,
291
- failure_reason=failure_reason,
292
- end_time=end_time,
293
- callback_func=callback_func)
294
- return False
295
- # Although the cluster is healthy, we fail to access the
296
- # job status. Try to recover the job (will not restart the
297
- # cluster, if the cluster is healthy).
298
- assert job_status is None, job_status
299
- logger.info('Failed to fetch the job status while the '
300
- 'cluster is healthy. Try to recover the job '
301
- '(the cluster will not be restarted).')
302
-
290
+ should_restart_on_failure = (
291
+ self._strategy_executor.should_restart_on_failure())
292
+ if should_restart_on_failure:
293
+ max_restarts = (
294
+ self._strategy_executor.max_restarts_on_errors)
295
+ logger.info(
296
+ f'User program crashed '
297
+ f'({managed_job_status.value}). '
298
+ f'Retry the job as max_restarts_on_errors is '
299
+ f'set to {max_restarts}. '
300
+ f'[{self._strategy_executor.restart_cnt_on_failure}'
301
+ f'/{max_restarts}]')
302
+ else:
303
+ managed_job_state.set_failed(
304
+ self._job_id,
305
+ task_id,
306
+ failure_type=managed_job_status,
307
+ failure_reason=failure_reason,
308
+ end_time=end_time,
309
+ callback_func=callback_func)
310
+ return False
311
+ else:
312
+ # Although the cluster is healthy, we fail to access the
313
+ # job status. Try to recover the job (will not restart the
314
+ # cluster, if the cluster is healthy).
315
+ assert job_status is None, job_status
316
+ logger.info('Failed to fetch the job status while the '
317
+ 'cluster is healthy. Try to recover the job '
318
+ '(the cluster will not be restarted).')
303
319
  # When the handle is None, the cluster should be cleaned up already.
304
320
  if handle is not None:
305
321
  resources = handle.launched_resources
@@ -66,7 +66,8 @@ class StrategyExecutor:
66
66
  RETRY_INIT_GAP_SECONDS = 60
67
67
 
68
68
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
69
- task: 'task_lib.Task', retry_until_up: bool) -> None:
69
+ task: 'task_lib.Task', retry_until_up: bool,
70
+ max_restarts_on_errors: int) -> None:
70
71
  """Initialize the strategy executor.
71
72
 
72
73
  Args:
@@ -82,6 +83,8 @@ class StrategyExecutor:
82
83
  self.cluster_name = cluster_name
83
84
  self.backend = backend
84
85
  self.retry_until_up = retry_until_up
86
+ self.max_restarts_on_errors = max_restarts_on_errors
87
+ self.restart_cnt_on_failure = 0
85
88
 
86
89
  def __init_subclass__(cls, name: str, default: bool = False):
87
90
  RECOVERY_STRATEGIES[name] = cls
@@ -109,8 +112,17 @@ class StrategyExecutor:
109
112
  # set the new_task_resources to be the same type (list or set) as the
110
113
  # original task.resources
111
114
  task.set_resources(type(task.resources)(new_resources_list))
112
- return RECOVERY_STRATEGIES[job_recovery](cluster_name, backend, task,
113
- retry_until_up)
115
+ if isinstance(job_recovery, dict):
116
+ job_recovery_name = job_recovery.pop('strategy',
117
+ DEFAULT_RECOVERY_STRATEGY)
118
+ max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
119
+ 0)
120
+ else:
121
+ job_recovery_name = job_recovery
122
+ max_restarts_on_errors = 0
123
+ return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
124
+ task, retry_until_up,
125
+ max_restarts_on_errors)
114
126
 
115
127
  def launch(self) -> float:
116
128
  """Launch the cluster for the first time.
@@ -368,6 +380,17 @@ class StrategyExecutor:
368
380
  f'{gap_seconds:.1f} seconds.')
369
381
  time.sleep(gap_seconds)
370
382
 
383
+ def should_restart_on_failure(self) -> bool:
384
+ """Increments counter & checks if job should be restarted on a failure.
385
+
386
+ Returns:
387
+ True if the job should be restarted, otherwise False.
388
+ """
389
+ self.restart_cnt_on_failure += 1
390
+ if self.restart_cnt_on_failure > self.max_restarts_on_errors:
391
+ return False
392
+ return True
393
+
371
394
 
372
395
  class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
373
396
  default=False):
@@ -376,8 +399,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
376
399
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.
377
400
 
378
401
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
379
- task: 'task_lib.Task', retry_until_up: bool) -> None:
380
- super().__init__(cluster_name, backend, task, retry_until_up)
402
+ task: 'task_lib.Task', retry_until_up: bool,
403
+ max_restarts_on_errors: int) -> None:
404
+ super().__init__(cluster_name, backend, task, retry_until_up,
405
+ max_restarts_on_errors)
381
406
  # Note down the cloud/region of the launched cluster, so that we can
382
407
  # first retry in the same cloud/region. (Inside recover() we may not
383
408
  # rely on cluster handle, as it can be None if the cluster is
@@ -2,6 +2,7 @@
2
2
  # TODO(zhwu): maybe use file based status instead of database, so
3
3
  # that we can easily switch to a s3-based storage.
4
4
  import enum
5
+ import json
5
6
  import pathlib
6
7
  import sqlite3
7
8
  import time
@@ -65,7 +66,8 @@ _CURSOR.execute("""\
65
66
  failure_reason TEXT,
66
67
  spot_job_id INTEGER,
67
68
  task_id INTEGER DEFAULT 0,
68
- task_name TEXT)""")
69
+ task_name TEXT,
70
+ specs TEXT)""")
69
71
  _CONN.commit()
70
72
 
71
73
  db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
@@ -92,6 +94,17 @@ db_utils.add_column_to_table(_CURSOR,
92
94
  'TEXT',
93
95
  copy_from='job_name')
94
96
 
97
+ # Specs is some useful information about the task, e.g., the
98
+ # max_restarts_on_errors value. It is stored in JSON format.
99
+ db_utils.add_column_to_table(_CURSOR,
100
+ _CONN,
101
+ 'spot',
102
+ 'specs',
103
+ 'TEXT',
104
+ value_to_replace_existing_entries=json.dumps({
105
+ 'max_restarts_on_errors': 0,
106
+ }))
107
+
95
108
  # `job_info` contains the mapping from job_id to the job_name.
96
109
  # In the future, it may contain more information about each job.
97
110
  _CURSOR.execute("""\
@@ -128,9 +141,10 @@ columns = [
128
141
  'job_id',
129
142
  'task_id',
130
143
  'task_name',
144
+ 'specs',
131
145
  # columns from the job_info table
132
146
  '_job_info_job_id', # This should be the same as job_id
133
- 'job_name'
147
+ 'job_name',
134
148
  ]
135
149
 
136
150
 
@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
283
297
 
284
298
  def set_submitted(job_id: int, task_id: int, run_timestamp: str,
285
299
  submit_time: float, resources_str: str,
286
- callback_func: CallbackType):
300
+ specs: Dict[str, Union[str,
301
+ int]], callback_func: CallbackType):
287
302
  """Set the task to submitted.
288
303
 
289
304
  Args:
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
293
308
  determine the log directory of the managed task.
294
309
  submit_time: The time when the managed task is submitted.
295
310
  resources_str: The resources string of the managed task.
311
+ specs: The specs of the managed task.
312
+ callback_func: The callback function.
296
313
  """
297
314
  # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
298
315
  # the log directory and submission time align with each other, so as to
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
306
323
  resources=(?),
307
324
  submitted_at=(?),
308
325
  status=(?),
309
- run_timestamp=(?)
326
+ run_timestamp=(?),
327
+ specs=(?)
310
328
  WHERE spot_job_id=(?) AND
311
329
  task_id=(?)""",
312
330
  (resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
313
- run_timestamp, job_id, task_id))
331
+ run_timestamp, json.dumps(specs), job_id, task_id))
314
332
  callback_func('SUBMITTED')
315
333
 
316
334
 
@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
619
637
  for (job_id,) in rows:
620
638
  return job_id
621
639
  return None
640
+
641
+
642
+ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
643
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
644
+ task_specs = cursor.execute(
645
+ """\
646
+ SELECT specs FROM spot
647
+ WHERE spot_job_id=(?) AND task_id=(?)""",
648
+ (job_id, task_id)).fetchone()
649
+ return json.loads(task_specs[0])
@@ -70,7 +70,7 @@ _JOB_CANCELLED_MESSAGE = (
70
70
  # state, after the job finished. This is a safeguard to avoid the case where
71
71
  # the managed job status fails to be updated and keep the `sky jobs logs`
72
72
  # blocking for a long time.
73
- _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
73
+ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
74
74
 
75
75
 
76
76
  class UserSignal(enum.Enum):
@@ -392,8 +392,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
392
392
  f'INFO: Log for the current task ({task_id}) '
393
393
  'is finished. Waiting for the next task\'s log '
394
394
  'to be started.')
395
- status_display.update('Waiting for the next task: '
396
- f'{task_id + 1}.')
395
+ # Add a newline to avoid the status display below
396
+ # removing the last line of the task output.
397
+ print()
398
+ status_display.update(
399
+ ux_utils.spinner_message(
400
+ f'Waiting for the next task: {task_id + 1}'))
397
401
  status_display.start()
398
402
  original_task_id = task_id
399
403
  while True:
@@ -405,7 +409,27 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
405
409
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
406
410
  continue
407
411
  else:
408
- break
412
+ task_specs = managed_job_state.get_task_specs(
413
+ job_id, task_id)
414
+ if task_specs.get('max_restarts_on_errors', 0) == 0:
415
+ # We don't need to wait for the managed job status
416
+ # update, as the job is guaranteed to be in terminal
417
+ # state afterwards.
418
+ break
419
+ print()
420
+ status_display.update(
421
+ ux_utils.spinner_message(
422
+ 'Waiting for next restart for the failed task'))
423
+ status_display.start()
424
+ while True:
425
+ _, managed_job_status = (
426
+ managed_job_state.get_latest_task_id_status(
427
+ job_id))
428
+ if (managed_job_status !=
429
+ managed_job_state.ManagedJobStatus.RUNNING):
430
+ break
431
+ time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
432
+ continue
409
433
  # The job can be cancelled by the user or the controller (when
410
434
  # the cluster is partially preempted).
411
435
  logger.debug(
@@ -311,30 +311,10 @@ def _create_vm(
311
311
  vm_name=vm_name,
312
312
  parameters=vm_instance,
313
313
  )
314
- # poller.result() will block on async operation until it's done.
315
- logger.info(f'Created VM {vm_poller.result().name}.')
316
- # Configure driver extension for A10 GPUs. A10 GPUs requires a
317
- # special type of drivers which is available at Microsoft HPC
318
- # extension. Reference:
319
- # https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2
320
- # This can take more than 20mins for setting up the A10 GPUs
321
- if node_config.get('need_nvidia_driver_extension', False):
322
- ext_poller = compute_client.virtual_machine_extensions.\
323
- begin_create_or_update(
324
- resource_group_name=provider_config['resource_group'],
325
- vm_name=vm_name,
326
- vm_extension_name='NvidiaGpuDriverLinux',
327
- extension_parameters=compute.VirtualMachineExtension(
328
- location=provider_config['location'],
329
- publisher='Microsoft.HpcCompute',
330
- type_properties_type='NvidiaGpuDriverLinux',
331
- type_handler_version='1.9',
332
- auto_upgrade_minor_version=True,
333
- settings='{}'))
334
- logger.info(
335
- f'Created VM extension {ext_poller.result().name} for VM {vm_name}.'
336
- )
337
- return vm_poller.result()
314
+ # This line will block until the VM is created or the operation times out.
315
+ vm = vm_poller.result()
316
+ logger.info(f'Created VM {vm.name}.')
317
+ return vm
338
318
 
339
319
 
340
320
  def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',