skypilot-nightly 1.0.0.dev20241109__py3-none-any.whl → 1.0.0.dev20241110__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
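
Each entry in the RECORD diff below has the form path,sha256=&lt;digest&gt;,&lt;size&gt;, where the digest is the URL-safe base64 encoding of the file's SHA-256 hash with the trailing '=' padding stripped (the standard wheel RECORD format). A minimal sketch for recomputing such an entry locally, assuming a file path relative to the installed package (the record_entry helper is illustrative, not part of SkyPilot):

    import base64
    import hashlib
    from pathlib import Path

    def record_entry(path: str) -> str:
        # Wheel RECORD files store the SHA-256 digest as URL-safe base64
        # with the trailing '=' padding removed, followed by the byte size.
        data = Path(path).read_bytes()
        digest = base64.urlsafe_b64encode(
            hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
        return f"{path},sha256={digest},{len(data)}"

    # Example: compare against the corresponding line in the diff.
    # print(record_entry("sky/__init__.py"))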
skypilot_nightly-1.0.0.dev20241109.dist-info/RECORD → skypilot_nightly-1.0.0.dev20241110.dist-info/RECORD
@@ -1,11 +1,11 @@
- sky/__init__.py,sha256=vuaxCFFtQHJTriSEGG_wKshl6nmhDcnt70q66x1rkvA,5882
+ sky/__init__.py,sha256=j3vy9X4XOYIefQk15d_c6Q_mpDjII9Nltso4xgrFI1o,5882
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
  sky/cli.py,sha256=jEjXs5Z0u263eJIsTHoKyG9oOY6giqw19s2di9kEv1s,212088
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
  sky/core.py,sha256=0-4W_DKJZgbwXuzNZKQ2R_qJxqxbqqNfyi0U0PQBKvQ,38230
- sky/dag.py,sha256=O9g8NnO8L1SGUEDyqW9W341AH4Wvd3nJs54niR-pkrk,2822
+ sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
  sky/exceptions.py,sha256=E3C2Ejcc8RUDAUQn7ar_Jr97C_AxD2rKKMmJOfLJ9d0,8965
  sky/execution.py,sha256=TwcorzFxR_0m8uazPdeKltU3g3ikgUSqqzcSBrHp7K4,26070
  sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
  sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
  sky/backends/backend_utils.py,sha256=2myfryj1zG9xxPaX6XYYJruxAOGNGbpsy2ckT4A77sE,121813
- sky/backends/cloud_vm_ray_backend.py,sha256=6Ew9Ej92KGlumlCnyDcGSEbHInj7g2Shqwx4oxRkWVQ,233122
+ sky/backends/cloud_vm_ray_backend.py,sha256=cL-IDyk9AOmHTAiQbXVwEr4dX6KPx4M-GiVEXxUYPWQ,232147
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
  sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
  sky/backends/wheel_utils.py,sha256=CUVOwlBtQjOMv-RSDGx2jMQ0M1D0w9ZPm0TDafJwBDI,8180
@@ -50,7 +50,7 @@ sky/clouds/gcp.py,sha256=BjCehW3s0IYkRDdEEDm0vYWXQDpOV8KU98OMVRPnQNg,54676
  sky/clouds/ibm.py,sha256=w8bo1EIY_YWYNu0fy-OpAyr6DZviU0tpIXUsiV01rVE,21423
  sky/clouds/kubernetes.py,sha256=tYjQFatOQmgtRzMt3J54CxM0w2ZPQwAo5SyyYkBcW9Y,28657
  sky/clouds/lambda_cloud.py,sha256=ExL_uixdFrF9qSL5JYXpaOXCZ9_eOA2q444kcmBHBXQ,12644
- sky/clouds/oci.py,sha256=sHJrVhUhOKvJ-skbd2ZJ82IR63OXp43krmyPpM8BZqw,27084
+ sky/clouds/oci.py,sha256=NOH-yYi1fbMkjqoz39zVXUEexE9MjE1c7YTvGtUgKzQ,26663
  sky/clouds/paperspace.py,sha256=4cjNua6jpuxmfidvLY4tSRX1oj_QaaHDinPMunGplyU,10868
  sky/clouds/runpod.py,sha256=_4myVdGIvQshkka8fn6mBXHgz5TZqhrNhAEM2_HrCT8,11487
  sky/clouds/scp.py,sha256=NivPvzQxA90R37QR3fgTup8ScGfxKsXAhH0xklAj5QU,15817
@@ -67,7 +67,7 @@ sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOi
  sky/clouds/service_catalog/ibm_catalog.py,sha256=1iK0KvbI82U7sySb7chr-qm_16x3tTnZ6nIo7o76ouc,4493
  sky/clouds/service_catalog/kubernetes_catalog.py,sha256=5ilQ-JK1ZS2EZp8GpCKok0H3S1fdI_aAznzIDWCY1NY,9110
  sky/clouds/service_catalog/lambda_catalog.py,sha256=2R-ccu63BbdvO6X80MtxiniA-jLewXb6I0Ye1rYD9fY,5302
- sky/clouds/service_catalog/oci_catalog.py,sha256=DQaP0iQlxZEHWJs862ilynUfUEQDIjCGltS7kSadgYo,8572
+ sky/clouds/service_catalog/oci_catalog.py,sha256=cyA6ZqwHGOKuPxUl_dKmFGdeWdQGMrvl_-o2MtyF998,8580
  sky/clouds/service_catalog/paperspace_catalog.py,sha256=MOlfoGRChjEwMzu4nRAho8DrIwwUJ3QlRzrMA1RLqvE,3789
  sky/clouds/service_catalog/runpod_catalog.py,sha256=oWYVgSMiK3DxBE5AgROyExIq9kCTaOr3hDLSc31kqTU,4205
  sky/clouds/service_catalog/scp_catalog.py,sha256=nrtD0hAZd1rUDsFuHI1hrBgAVSE5YprdWoYSXQooIqU,5195
@@ -85,7 +85,7 @@ sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
  sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
  sky/clouds/utils/azure_utils.py,sha256=NToRBnhEyuUvb-nBnsKTxjhOBRkMcrelL8LK4w6s4t8,3555
  sky/clouds/utils/gcp_utils.py,sha256=QejfgXOIVRv5-fv3Soi96VeVNVyquwVwy3M58N3YfNs,6633
- sky/clouds/utils/oci_utils.py,sha256=t-5QEQEs8swN683AAp-oDD6yQJOQqVBbsVcHkNyqnbU,4968
+ sky/clouds/utils/oci_utils.py,sha256=LILpS38_exeMjmJdNpzwDR8hfGSpWjaRKl1CWKA-zHs,5579
  sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
  sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
  sky/data/data_transfer.py,sha256=MBmjey9_p2L3IKNKTi8um09SlZe32n4wK3CkVnlTVvo,7346
@@ -103,7 +103,7 @@ sky/jobs/utils.py,sha256=Ff3TttIEdVeM1_kOVkviqIDjeVfBPIXVE8i-yP1VDM8,37976
  sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
  sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
- sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,6262
+ sky/provision/__init__.py,sha256=llAtnAAzx0TKT17B0JL_2ZiKea9RRQRxSzkWHQYqWTo,6292
  sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
  sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
  sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc,19199
@@ -147,6 +147,10 @@ sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X
  sky/provision/lambda_cloud/config.py,sha256=jq1iLzp4Up61r4JGxvtpVbJlgXnea3LHYQhCQyyl7ik,272
  sky/provision/lambda_cloud/instance.py,sha256=5-XuX-KwlRq8y62NXNzY_p6aJs4iCPGBf5U4pIR4liI,8975
  sky/provision/lambda_cloud/lambda_utils.py,sha256=wIXV1Qe362f8Q9u8DSx2e9IJs4CF03Jr3idHCzhlRz4,9879
+ sky/provision/oci/__init__.py,sha256=5E6EUtTK3mqGVREw5TuVl5DxteBYTZigIii7c8gHExU,612
+ sky/provision/oci/config.py,sha256=diSDTyHLokcuXGB2XgZCHFvsXa8bah1PP2XuMouW_UU,1650
+ sky/provision/oci/instance.py,sha256=Y7z7N8sTpnzznL_GAtBeErzrF7r-zd9BZ7ZnC9DjFQg,16649
+ sky/provision/oci/query_utils.py,sha256=SUVOVRawFslEfkIRPqe8_pLYJRiGQvKpQ77-LRf9kgI,20304
  sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
  sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
  sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
@@ -183,7 +187,7 @@ sky/serve/serve_state.py,sha256=Q7De4GoBEPxlN_t1Lpn-Y1fd94SeHZ3E-94f1OTuhpc,1908
  sky/serve/serve_utils.py,sha256=wqBxChpJylZ_qHWyFmMBJqrG8_7xTIOr9nlOeyHs9P8,39431
  sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
  sky/serve/service_spec.py,sha256=1aS6b-ku7W4CjyekXKDxjZsDdt-O8ygos-jFeXu31cA,13766
- sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
+ sky/setup_files/MANIFEST.in,sha256=WF0T89NLichHxZDDSQzvSpiONtAEFyur2MPmGczgTIo,555
  sky/setup_files/setup.py,sha256=G767GNB-jXqyC8MR-IdiojnnI2E6tP4gMYenKU14ZGA,12156
  sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
  sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -192,7 +196,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
  sky/skylet/constants.py,sha256=w05Enrg9RhGp99P1WDYMKK_ki0M-e0bS8Wr-VZR0Vn8,14468
  sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
- sky/skylet/job_lib.py,sha256=FD1n9vE0daOEUKSH3lnccfBh7Vs81R8s4ILZyKu2o7M,37275
+ sky/skylet/job_lib.py,sha256=aY2qqZGA59hVTp6FtP3N_Wkrl8wzO8XFOOjhODpQGZg,37737
  sky/skylet/log_lib.py,sha256=BmhAgcLvlin3szhj33IH0kbdCALacVisF2x61BQpZdY,21888
  sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
  sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -203,10 +207,6 @@ sky/skylet/providers/ibm/__init__.py,sha256=GXo5F9ztvs0qMDI_G9wM5KvzySfYugslJMHH
  sky/skylet/providers/ibm/node_provider.py,sha256=olNtCoCxjXTT-C_youwdQ9UF1DPgO8OVwDueotGFaJI,38280
  sky/skylet/providers/ibm/utils.py,sha256=63vhKqLLOhAZdibSp8VWWONeyCER9F6U2VLrSpzlizk,1292
  sky/skylet/providers/ibm/vpc_provider.py,sha256=GiOGlWYqqeBETfAeKqVj2-9shsMSP7z1WnO8UP5JTNo,34630
- sky/skylet/providers/oci/__init__.py,sha256=LRMTj6OhQoxiFJw4uNxG8cn6PllP8A-lGJL3Cs5DJok,91
- sky/skylet/providers/oci/node_provider.py,sha256=YPqiRag_cysvYMIMDGbMn6lOumvHad6FLJB5DGPr00Q,20492
- sky/skylet/providers/oci/query_helper.py,sha256=dUsvPGzWPNF5O2NjQvuC8tkilT4H11gMj6R7Qel2fDc,17202
- sky/skylet/providers/oci/utils.py,sha256=lCpdklxgSwK-hqErTicpIe_xkpSlIc8u943C-9_MJfU,508
  sky/skylet/providers/scp/__init__.py,sha256=15SiAh1YphXkZsHySaw_CeAmXRdoM4JtNIAt7SLbUvg,91
  sky/skylet/providers/scp/config.py,sha256=lhMXyG9btMlg59nmvtnMdIDN07jBbQOheAx-bHbGbhw,5077
  sky/skylet/providers/scp/node_provider.py,sha256=W5J-170JVIpwT9Fv20fJ_PpdAVsqx9pigE-RkkG_kQE,22459
@@ -232,7 +232,7 @@ sky/templates/kubernetes-ray.yml.j2,sha256=dsWlkX-0b1igeZI4c0u0Jzia5I_9gezCiewR6
  sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
  sky/templates/lambda-ray.yml.j2,sha256=HyvO_tX2vxwSsc4IFVSqGuIbjLMk0bevP9bcxb8ZQII,4498
  sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
- sky/templates/oci-ray.yml.j2,sha256=E-xnadts-x88vYRI1QGFzgfGGKFospmo2N9d_0cPN5I,7144
+ sky/templates/oci-ray.yml.j2,sha256=92dvXGaUd2Kwep9fgTjOsAPJiBLr8GQTjy7pFvuPAyE,4562
  sky/templates/paperspace-ray.yml.j2,sha256=HQjZNamrB_a4fOMCxQXSVdV5JIHtbGtAE0JzEO8uuVQ,4021
  sky/templates/runpod-ray.yml.j2,sha256=p3BtYBHzROtNJqnjEo1xCmGSJQfCZYdarWszhDYyl0Q,3697
  sky/templates/scp-ray.yml.j2,sha256=I9u8Ax-lit-d6UrCC9BVU8avst8w1cwK6TrzZBcz_JM,5608
@@ -275,9 +275,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
- skypilot_nightly-1.0.0.dev20241109.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
- skypilot_nightly-1.0.0.dev20241109.dist-info/METADATA,sha256=YM8C71GXOj5CoHQlj5yNYhL8UkZ75DL-qMMTPXCOmXY,19708
- skypilot_nightly-1.0.0.dev20241109.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
- skypilot_nightly-1.0.0.dev20241109.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
- skypilot_nightly-1.0.0.dev20241109.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
- skypilot_nightly-1.0.0.dev20241109.dist-info/RECORD,,
+ skypilot_nightly-1.0.0.dev20241110.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+ skypilot_nightly-1.0.0.dev20241110.dist-info/METADATA,sha256=4ar4pUczmGqsEHMG-85ANcAB_ifYgIDJRr0BJfypruA,19708
+ skypilot_nightly-1.0.0.dev20241110.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ skypilot_nightly-1.0.0.dev20241110.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+ skypilot_nightly-1.0.0.dev20241110.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+ skypilot_nightly-1.0.0.dev20241110.dist-info/RECORD,,
sky/skylet/providers/oci/__init__.py (deleted)
@@ -1,2 +0,0 @@
- """OCI node provider"""
- from sky.skylet.providers.oci.node_provider import OCINodeProvider
sky/skylet/providers/oci/node_provider.py (deleted)
@@ -1,488 +0,0 @@
- """OCI Node Provider.
-
- Node provider is called by the Ray Autoscaler to provision new compute
- resources (head / worker nodes).
-
- To show debug messages, export SKYPILOT_DEBUG=1
-
- History:
- - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
-
- """
-
- import copy
- from datetime import datetime
- import logging
- import threading
- import time
-
- from ray.autoscaler.node_provider import NodeProvider
- from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
- from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG
- from ray.autoscaler.tags import TAG_RAY_NODE_KIND
- from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
-
- from sky.adaptors import oci as oci_adaptor
- from sky.clouds.utils import oci_utils
- from sky.skylet.providers.oci import utils
- from sky.skylet.providers.oci.query_helper import oci_query_helper
-
- logger = logging.getLogger(__name__)
-
-
- def synchronized(f):
-
-     def wrapper(self, *args, **kwargs):
-         self.lock.acquire()
-         try:
-             return f(self, *args, **kwargs)
-         finally:
-             self.lock.release()
-
-     return wrapper
-
-
- class OCINodeProvider(NodeProvider):
-     """Node Provider for OracleCloud (OCI)."""
-
-     def __init__(self, provider_config, cluster_name):
-         NodeProvider.__init__(self, provider_config, cluster_name)
-         self.lock = threading.RLock()
-         self.cached_nodes = {}
-         self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
-                                                        True)
-         self.region = provider_config["region"]
-
-         # Do a read-ahead cache loading to improve performance.
-         self._get_filtered_nodes({})
-
-     @synchronized
-     def _get_filtered_nodes(self, tag_filters, force=False):
-         # Make sure the cluster_name is always a criterion
-         tag_filters = {**tag_filters, TAG_RAY_CLUSTER_NAME: self.cluster_name}
-
-         return_nodes = {}
-         if not force:
-             # Query cache first to reduce API call.
-             cache_hit = False
-             for k, node in self.cached_nodes.items():
-                 tags = node["tags"]
-                 unmatched_tags = [
-                     k for k, v in tag_filters.items()
-                     if k not in tags or v != tags[k]
-                 ]
-                 if len(unmatched_tags) == 0:
-                     return_nodes[k] = node
-                     cache_hit |= True
-
-             if cache_hit:
-                 return return_nodes
-
-         insts = oci_query_helper.query_instances_by_tags(
-             tag_filters, self.region)
-         for inst in insts:
-             inst_id = inst.identifier
-             if inst_id in self.cached_nodes:
-                 del self.cached_nodes[inst_id]
-
-             item = self.get_inst_obj({
-                 "inst_id": inst_id,
-                 "ad": inst.availability_domain,
-                 "compartment": inst.compartment_id,
-                 "lifecycle_state": inst.lifecycle_state,
-                 "oci_tags": inst.freeform_tags,
-             })
-             return_nodes[inst_id] = item
-             self.cached_nodes[inst_id] = item
-
-         return return_nodes
-
-     @utils.debug_enabled(logger=logger)
-     def non_terminated_nodes(self, tag_filters):
-         """Return a list of node ids filtered by the specified tags dict.
-
-         This list must not include terminated nodes. For performance reasons,
-         providers are allowed to cache the result of a call to
-         non_terminated_nodes() to serve single-node queries
-         (e.g. is_running(node_id)). This means that non_terminated_nodes()
-         must be called again to refresh results.
-         """
-         VALIDITY_TAGS = [
-             TAG_RAY_CLUSTER_NAME,
-             TAG_RAY_NODE_KIND,
-             TAG_RAY_USER_NODE_TYPE,
-             TAG_RAY_LAUNCH_CONFIG,
-         ]
-         filters = {
-             tag: tag_filters[tag] for tag in VALIDITY_TAGS if tag in tag_filters
-         }
-
-         nodes = self._get_filtered_nodes(tag_filters=filters)
-         return [k for k, v in nodes.items() if v["status"] == "RUNNING"]
-
-     @utils.debug_enabled(logger=logger)
-     def is_running(self, node_id):
-         """Return whether the specified node is running."""
-         node = self._get_cached_node(node_id=node_id)
-         check_result = node is None or node["status"] == "RUNNING"
-
-         return check_result
-
-     @utils.debug_enabled(logger=logger)
-     def is_terminated(self, node_id):
-         """Return whether the specified node is terminated."""
-         node = self._get_cached_node(node_id=node_id)
-         check_result = ((node is None) or (node["status"] == "TERMINATED") or
-                         (node["status"] == "TERMINATING"))
-
-         return check_result
-
-     @utils.debug_enabled(logger=logger)
-     def node_tags(self, node_id):
-         return self.cached_nodes[node_id]["tags"]
-
-     @utils.debug_enabled(logger=logger)
-     def external_ip(self, node_id):
-         """Returns the external ip of the given node."""
-         return self._get_cached_node(node_id=node_id)["external_ip"]
-
-     @utils.debug_enabled(logger=logger)
-     def internal_ip(self, node_id):
-         """Returns the internal ip (Ray ip) of the given node."""
-         return self._get_cached_node(node_id=node_id)["internal_ip"]
-
-     @synchronized
-     @utils.debug_enabled(logger=logger)
-     def create_node(self, node_config, tags, count):
-         """Creates a number of nodes within the namespace."""
-         start_time = round(time.time() * 1000)
-         starting_insts = []
-         # Check first if it is necessary to create new nodes / start stopped nodes
-         VALIDITY_TAGS = [
-             TAG_RAY_CLUSTER_NAME,
-             TAG_RAY_NODE_KIND,
-             TAG_RAY_USER_NODE_TYPE,
-         ]
-         filters = {tag: tags[tag] for tag in VALIDITY_TAGS if tag in tags}
-
-         # Starting stopped nodes if cache_stopped_nodes=True
-         if self.cache_stopped_nodes:
-             logger.debug("Checking existing stopped nodes.")
-
-             filters_with_launch_config = copy.copy(filters)
-             if TAG_RAY_LAUNCH_CONFIG in tags:
-                 filters_with_launch_config[TAG_RAY_LAUNCH_CONFIG] = tags[
-                     TAG_RAY_LAUNCH_CONFIG]
-
-             nodes_matching_launch_config = self.stopped_nodes(
-                 filters_with_launch_config)
-             logger.debug(f"Found stopped nodes (with same launch config): "
-                          f"{len(nodes_matching_launch_config)}")
-
-             reuse_nodes = []
-             if len(nodes_matching_launch_config) >= count:
-                 reuse_nodes = nodes_matching_launch_config[:count]
-             else:
-                 nodes_all = self.stopped_nodes(filters)
-                 logger.debug(f"Found stopped nodes (regardless launch config): "
-                              f"{len(nodes_all)}")
-                 nodes_matching_launch_config_ids = [
-                     n["id"] for n in nodes_matching_launch_config
-                 ]
-                 nodes_non_matching_launch_config = [
-                     n for n in nodes_all
-                     if n["id"] not in nodes_matching_launch_config_ids
-                 ]
-                 reuse_nodes = (nodes_matching_launch_config +
-                                nodes_non_matching_launch_config)
-                 reuse_nodes = reuse_nodes[:count]
-
-             logger.info(
-                 f"Reusing nodes {len(reuse_nodes)}: {list(reuse_nodes)}. "
-                 "To disable reuse, set `cache_stopped_nodes: False` "
-                 "under `provider` in the cluster configuration.",)
-
-             for reuse_node in reuse_nodes:
-                 if reuse_node["status"] == "STOPPING":
-                     get_instance_response = oci_adaptor.get_core_client(
-                         self.region,
-                         oci_utils.oci_config.get_profile()).get_instance(
-                             instance_id=reuse_node["id"])
-                     oci_adaptor.oci.wait_until(
-                         oci_adaptor.get_core_client(
-                             self.region, oci_utils.oci_config.get_profile()),
-                         get_instance_response,
-                         "lifecycle_state",
-                         "STOPPED",
-                     )
-
-             start_time1 = round(time.time() * 1000)
-             for matched_node in reuse_nodes:
-                 matched_node_id = matched_node["id"]
-                 instance_action_response = oci_adaptor.get_core_client(
-                     self.region,
-                     oci_utils.oci_config.get_profile()).instance_action(
-                         instance_id=matched_node_id, action="START")
-
-                 starting_inst = instance_action_response.data
-                 starting_insts.append({
-                     "inst_id": starting_inst.id,
-                     "ad": starting_inst.availability_domain,
-                     "compartment": starting_inst.compartment_id,
-                     "lifecycle_state": starting_inst.lifecycle_state,
-                     "oci_tags": starting_inst.freeform_tags,
-                 })
-             count -= len(reuse_nodes)
-
-             launch_stopped_time = round(time.time() * 1000) - start_time1
-             logger.debug(
-                 "Time elapsed(Launch stopped): {0} milli-seconds.".format(
-                     launch_stopped_time))
-         # end if self.cache_stopped_nodes:...
-
-         # Let's create additional new nodes (if necessary)
-         if count > 0:
-             compartment = oci_query_helper.find_compartment(self.region)
-             vcn = oci_query_helper.find_create_vcn_subnet(self.region)
-             if vcn is None:
-                 raise RuntimeError("VcnSubnetNotFound Error!")
-
-             ocpu_count = 0
-             vcpu_str = node_config["VCPUs"]
-             instance_type_str = node_config["InstanceType"]
-
-             if vcpu_str is not None and vcpu_str != "None":
-                 if instance_type_str.startswith(
-                         f"{oci_utils.oci_config.VM_PREFIX}.A"):
-                     # For ARM cpu, 1*ocpu = 1*vcpu
-                     ocpu_count = round(float(vcpu_str))
-                 else:
-                     # For Intel / AMD cpu, 1*ocpu = 2*vcpu
-                     ocpu_count = round(float(vcpu_str) / 2)
-                 ocpu_count = 1 if (ocpu_count > 0 and
-                                    ocpu_count < 1) else ocpu_count
-
-             machine_shape_config = None
-             if ocpu_count > 0:
-                 mem = node_config["MemoryInGbs"]
-                 if mem is not None and mem != "None":
-                     machine_shape_config = (oci_adaptor.oci.core.models.
-                                             LaunchInstanceShapeConfigDetails(
-                                                 ocpus=ocpu_count,
-                                                 memory_in_gbs=mem))
-                 else:
-                     machine_shape_config = (oci_adaptor.oci.core.models.
-                                             LaunchInstanceShapeConfigDetails(
-                                                 ocpus=ocpu_count))
-
-             preempitible_config = (
-                 oci_adaptor.oci.core.models.PreemptibleInstanceConfigDetails(
-                     preemption_action=oci_adaptor.oci.core.models.
-                     TerminatePreemptionAction(type="TERMINATE",
-                                               preserve_boot_volume=False))
-                 if node_config["Preemptible"] else None)
-
-             logger.debug(f"Shape: {instance_type_str}, ocpu: {ocpu_count}")
-             logger.debug(f"Shape config is {machine_shape_config}")
-             logger.debug(f"Spot config is {preempitible_config}")
-
-             vm_tags = {
-                 **tags,
-                 TAG_RAY_CLUSTER_NAME: self.cluster_name,
-                 "sky_spot_flag": str(node_config["Preemptible"]).lower(),
-             }
-             # Use UTC time so that head & worker nodes use the same rule
-             batch_id = datetime.utcnow().strftime("%Y%m%d%H%M%S")
-             node_type = tags[TAG_RAY_NODE_KIND]
-
-             oci_query_helper.subscribe_image(
-                 compartment_id=compartment,
-                 listing_id=node_config["AppCatalogListingId"],
-                 resource_version=node_config["ResourceVersion"],
-                 region=self.region,
-             )
-
-             start_time1 = round(time.time() * 1000)
-             for seq in range(1, count + 1):
-                 launch_instance_response = oci_adaptor.get_core_client(
-                     self.region, oci_utils.oci_config.get_profile()
-                 ).launch_instance(
-                     launch_instance_details=oci_adaptor.oci.core.models.
-                     LaunchInstanceDetails(
-                         availability_domain=node_config["AvailabilityDomain"],
-                         compartment_id=compartment,
-                         shape=instance_type_str,
-                         display_name=
-                         f"{self.cluster_name}_{node_type}_{batch_id}_{seq}",
-                         freeform_tags=vm_tags,
-                         metadata={
-                             "ssh_authorized_keys": node_config["AuthorizedKey"]
-                         },
-                         source_details=oci_adaptor.oci.core.models.
-                         InstanceSourceViaImageDetails(
-                             source_type="image",
-                             image_id=node_config["ImageId"],
-                             boot_volume_size_in_gbs=node_config[
-                                 "BootVolumeSize"],
-                             boot_volume_vpus_per_gb=int(
-                                 node_config["BootVolumePerf"]),
-                         ),
-                         create_vnic_details=oci_adaptor.oci.core.models.
-                         CreateVnicDetails(
-                             assign_public_ip=True,
-                             subnet_id=vcn,
-                         ),
-                         shape_config=machine_shape_config,
-                         preemptible_instance_config=preempitible_config,
-                     ))
-
-                 new_inst = launch_instance_response.data
-                 starting_insts.append({
-                     "inst_id": new_inst.id,
-                     "ad": new_inst.availability_domain,
-                     "compartment": new_inst.compartment_id,
-                     "lifecycle_state": new_inst.lifecycle_state,
-                     "oci_tags": new_inst.freeform_tags,
-                 })
-             # end for loop
-
-             launch_new_time = round(time.time() * 1000) - start_time1
-             logger.debug("Time elapsed(Launch): {0} milli-seconds.".format(
-                 launch_new_time))
-         # end if count > 0:...
-
-         for ninst in starting_insts:
-             # Wait for the instance to reach the RUNNING state
-             get_instance_response = oci_adaptor.get_core_client(
-                 self.region, oci_utils.oci_config.get_profile()).get_instance(
-                     instance_id=ninst["inst_id"])
-             oci_adaptor.oci.wait_until(
-                 oci_adaptor.get_core_client(self.region,
-                                             oci_utils.oci_config.get_profile()),
-                 get_instance_response,
-                 "lifecycle_state",
-                 "RUNNING",
-             )
-             ninst["lifecycle_state"] = "RUNNING"
-             self.cached_nodes[ninst["inst_id"]] = self.get_inst_obj(ninst)
-
-         total_time = round(time.time() * 1000) - start_time
-         logger.debug(
-             "Total time elapsed: {0} milli-seconds.".format(total_time))
-
-     def get_inst_obj(self, inst_info):
-         list_vnic_attachments_response = oci_adaptor.get_core_client(
-             self.region,
-             oci_utils.oci_config.get_profile()).list_vnic_attachments(
-                 availability_domain=inst_info["ad"],
-                 compartment_id=inst_info["compartment"],
-                 instance_id=inst_info["inst_id"],
-             )
-
-         vnic = list_vnic_attachments_response.data[0]
-         get_vnic_response = (oci_adaptor.get_net_client(
-             self.region, oci_utils.oci_config.get_profile()).get_vnic(
-                 vnic_id=vnic.vnic_id).data)
-
-         internal_ip = get_vnic_response.private_ip
-         external_ip = get_vnic_response.public_ip
-         if external_ip is None:
-             external_ip = internal_ip
-
-         return {
-             "id": inst_info["inst_id"],
-             "external_ip": external_ip,
-             "internal_ip": internal_ip,
-             "tags": inst_info["oci_tags"],
-             "status": inst_info["lifecycle_state"],
-         }
-
-     @synchronized
-     @utils.debug_enabled(logger=logger)
-     def set_node_tags(self, node_id, tags):
-         existing_tags = self._get_cached_node(node_id)["tags"]
-         combined_tags = dict(existing_tags, **tags)
-
-         self.cached_nodes[node_id]["tags"] = combined_tags
-         retry_count = 0
-         while retry_count < oci_utils.oci_config.MAX_RETRY_COUNT:
-             try:
-                 oci_adaptor.get_core_client(
-                     self.region,
-                     oci_utils.oci_config.get_profile()).update_instance(
-                         instance_id=node_id,
-                         update_instance_details=oci_adaptor.oci.core.models.
-                         UpdateInstanceDetails(freeform_tags=combined_tags),
-                     )
-                 logger.info(f"Tags are well set for node {node_id}")
-                 break
-             except Exception as e:
-                 retry_count = retry_count + 1
-                 wait_seconds = oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS * retry_count
-                 logger.warn(
-                     f"Not ready yet, wait {wait_seconds} seconds & retry!")
-                 logger.warn(f"Exception message is {str(e)}")
-                 time.sleep(wait_seconds)
-
-     @synchronized
-     def terminate_node(self, node_id):
-         """Terminates the specified node."""
-         logger.info(f"terminate_node {node_id}...")
-         node = self._get_cached_node(node_id)
-         if node is None:
-             logger.info(f"The node is not existed: {node_id}..")
-             return  # Node does not exist yet.
-
-         logger.debug(f"sky_spot_flag: {node['tags']['sky_spot_flag']}")
-         preemptibleFlag = (True if node and
-                            (str(node["tags"]["sky_spot_flag"]) == "true") else
-                            False)
-
-         if self.cache_stopped_nodes and not preemptibleFlag:
-             logger.info(f"Stopping instance {node_id}"
-                         "(to fully terminate instead, "
-                         "set `cache_stopped_nodes: False` "
-                         "under `provider` in the cluster configuration)")
-             instance_action_response = oci_adaptor.get_core_client(
-                 self.region,
-                 oci_utils.oci_config.get_profile()).instance_action(
-                     instance_id=node_id, action="STOP")
-             logger.info(
-                 f"Stopped the instance {instance_action_response.data.id}")
-             if node_id in self.cached_nodes:
-                 self.cached_nodes[node_id]["status"] = "STOPPED"
-             state_word = "Stopped"
-         else:
-             terminate_instance_response = oci_adaptor.get_core_client(
-                 self.region,
-                 oci_utils.oci_config.get_profile()).terminate_instance(node_id)
-             logger.debug(terminate_instance_response.data)
-             if node_id in self.cached_nodes:
-                 del self.cached_nodes[node_id]
-             state_word = "Terminated"
-
-         logger.info(
-             f"{state_word} {node_id} w/ sky_spot_flag: {preemptibleFlag}.")
-
-     def _get_node(self, node_id):
-         self._get_filtered_nodes({},
-                                  force=True)  # All except for those terminated.
-         return self.cached_nodes.get(node_id, None)
-
-     def _get_cached_node(self, node_id):
-         if node_id in self.cached_nodes:
-             return self.cached_nodes[node_id]
-         return self._get_node(node_id=node_id)
-
-     def stopped_nodes(self, tag_filters):
-         """Return a list of stopped nodes filtered by the specified tags dict."""
-         nodes = self._get_filtered_nodes(tag_filters=tag_filters, force=True)
-         return [
-             v for _, v in nodes.items()
-             if v["status"] in ("STOPPED", "STOPPING")
-         ]
-
-     def running_nodes(self, tag_filters):
-         """Return a list of running node ids filtered by the specified tags dict."""
-         nodes = self._get_filtered_nodes(tag_filters=tag_filters)
-         return [k for k, v in nodes.items() if v["status"] == "RUNNING"]
sky/skylet/providers/oci/utils.py (deleted)
@@ -1,21 +0,0 @@
- from datetime import datetime
- import functools
- from logging import Logger
-
-
- def debug_enabled(logger: Logger):
-
-     def decorate(f):
-
-         @functools.wraps(f)
-         def wrapper(*args, **kwargs):
-             dt_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-             logger.debug(f"{dt_str} Enter {f}, {args}, {kwargs}")
-             try:
-                 return f(*args, **kwargs)
-             finally:
-                 logger.debug(f"{dt_str} Exit {f}")
-
-         return wrapper
-
-     return decorate